Crawler

  1. <?php
  2. // $Id$
  3.  
  4.  
  /**
   * Minimal cURL-based browser used by the security scanner crawler.
   *
   * Wraps one reusable cURL handle, remembers the body of the last response
   * and can turn that body into a SimpleXMLElement tree for XPath queries.
   */
  class drupal_security_scanner_test {

    /**
     * The cURL handle; not set until curlConnect() has run.
     */
    public $ch;

    /**
     * cURL options that take precedence over the per-request options.
     */
    public $curl_options = array();

    /**
     * Parsed representation of the last response, or FALSE when not parsed.
     */
    public $elements;

    /**
     * Value handed to CURLOPT_COOKIEJAR when the connection is initialised.
     */
    public $cookie_file = NULL;

    /**
     * Raw body returned by the last curl_exec() call (FALSE when it failed).
     */
    public $_content;

    /**
     * Reset to FALSE by curlExec() after every request.
     */
    public $plain_text = FALSE;

    /**
     * Initializes the cURL connection and performs a first request.
     *
     * Adds HTTP authentication credentials taken from the
     * simpletest_httpauth_username and simpletest_httpauth_pass variables
     * unless CURLOPT_USERPWD is already present in $this->curl_options.
     *
     * @return
     *   The content of the initial request when the handle was created by this
     *   call, nothing when a handle already existed.
     */
    public function curlConnect() {
      global $base_url, $db_prefix;
      if (!isset($this->ch)) {
        $this->ch = curl_init();
        $curl_options = $this->curl_options + array(
          CURLOPT_COOKIEJAR => $this->cookie_file,
          CURLOPT_URL => $base_url,
          CURLOPT_FOLLOWLOCATION => TRUE,
          CURLOPT_RETURNTRANSFER => TRUE,
        );
        // $db_prefix is only usable as a pattern subject when it is a string.
        if (is_string($db_prefix) && preg_match('/simpletest\d+/', $db_prefix)) {
          $curl_options[CURLOPT_USERAGENT] = $db_prefix;
        }
        if (!isset($curl_options[CURLOPT_USERPWD]) && ($auth = variable_get('simpletest_httpauth_username', ''))) {
          if ($pass = variable_get('simpletest_httpauth_pass', '')) {
            $auth .= ':' . $pass;
          }
          $curl_options[CURLOPT_USERPWD] = $auth;
        }
        // curlExec() calls curlConnect() again, which is a no-op now that the
        // handle exists, so there is no endless recursion.
        return $this->curlExec($curl_options);
      }
    }

    /**
     * Performs a cURL exec with the specified options after calling curlConnect().
     *
     * @param array $curl_options
     *   Custom cURL options. Entries in $this->curl_options win on conflict.
     * @return
     *   Content returned from the exec, or FALSE when the transfer failed.
     */
    public function curlExec($curl_options) {
      $this->curlConnect();
      curl_setopt_array($this->ch, $this->curl_options + $curl_options);
      $this->_content = curl_exec($this->ch);
      // A new response invalidates everything derived from the previous one.
      $this->plain_text = FALSE;
      $this->elements = FALSE;
      return $this->_content;
    }

    /**
     * Closes the cURL handle and unsets it.
     */
    public function curlClose() {
      if (isset($this->ch)) {
        curl_close($this->ch);
        unset($this->ch);
      }
    }

    /**
     * Parses the content returned from curlExec() using DOM and SimpleXML.
     *
     * The result is cached in $this->elements until the next curlExec().
     *
     * @return
     *   A SimpleXMLElement, or FALSE when there is no content or it cannot be
     *   parsed.
     */
    public function parse() {
      if (!$this->elements) {
        // A failed transfer leaves FALSE here and an empty body cannot be
        // loaded as a document, so only attempt to parse real content.
        if (is_string($this->_content) && $this->_content !== '') {
          $htmlDom = new DOMDocument();
          // DOM can load HTML soup, but soup raises warnings; suppress them.
          if (@$htmlDom->loadHTML($this->_content)) {
            // SimpleXML is much easier to work with than DOM, and the DOM tree
            // can be imported directly.
            $this->elements = simplexml_import_dom($htmlDom);
          }
        }
      }
      return $this->elements ? $this->elements : FALSE;
    }

    /**
     * Retrieves a Drupal path or an absolute path.
     *
     * @param string $path
     *   Drupal path or URL to load.
     * @param array $options
     *   Options to be forwarded to url(); 'absolute' is always forced to TRUE.
     * @return
     *   The retrieved content, also kept in $this->_content.
     */
    public function drupalGet($path, $options = array()) {
      $options['absolute'] = TRUE;

      // The cURL handle is re-used. Options left over from an earlier request
      // could turn this GET into a POST, so clear them explicitly.
      return $this->curlExec(array(
        CURLOPT_URL => url($path, $options),
        CURLOPT_POST => FALSE,
        CURLOPT_POSTFIELDS => array(),
      ));
    }
  }
  108.  
  /**
   * Implementation of hook_menu().
   *
   * @return
   *   Menu items keyed by path; the only entry is the scanner page at
   *   admin/settings/security_scanner.
   */
  function security_scanner_menu() {
    // Definition of the single page this module exposes.
    $scanner_page = array(
      'title' => 'Security Scanner',
      'page callback' => 'page_security_scanner',
      'access arguments' => array('access scanner'),
      'type' => MENU_NORMAL_ITEM,
    );

    return array('admin/settings/security_scanner' => $scanner_page);
  }
  121.  
  /**
   * Page callback: runs the crawler for about five seconds.
   *
   * Seeds the {crawler_links} queue with the start URL, then repeatedly claims
   * one queued link, fetches it and queues every anchor target found on the
   * fetched page.
   *
   * @return
   *   HTML markup for the page body.
   */
  function page_security_scanner() {
    // NOTE(review): hard-coded development URL; presumably this should come
    // from configuration - confirm before relying on it elsewhere.
    $initial_path = 'http://localhost/soc2008/';
    db_query("INSERT INTO {crawler_links} VALUES ('','%s','','','')", $initial_path);

    // Crawl until the time budget is used up.
    $time = time() + 5;
    while (time() < $time) {
      // Initialize the crawler.
      db_query('INSERT INTO {crawler} VALUES (default)');
      $crawler_id = db_last_insert_id('crawler', 'id');

      // Claim one unassigned link for this crawler run.
      db_query("UPDATE {crawler_links} SET crawler_id = %d WHERE crawler_id = 0 LIMIT 1", $crawler_id);

      // Get the claimed link from the crawler_links table.
      $result = db_query("SELECT path FROM {crawler_links} WHERE crawler_id = %d AND status = 0 LIMIT 1", $crawler_id);
      $page_to_visit = db_fetch_array($result);
      if (!$page_to_visit) {
        // Nothing could be claimed, so the queue is empty: stop instead of
        // spinning until the time budget runs out.
        break;
      }

      // Flag the link as processed.
      db_query("UPDATE {crawler_links} SET status = 1 WHERE crawler_id = %d and status = 0 LIMIT 1", $crawler_id);

      // Fetch and parse the page. The row is keyed by column name, so the
      // selected column has to be read as 'path'.
      $obj = new drupal_security_scanner_test();
      $obj->drupalGet($page_to_visit['path']);
      $obj->parse();
      $parsed = $obj->elements;
      // Release the cURL handle; a new browser object is created per page.
      $obj->curlClose();

      if (!is_object($parsed)) {
        // The page could not be fetched or parsed; move on to the next link.
        continue;
      }

      $links = $parsed->xpath('//a');
      if (!$links) {
        continue;
      }
      foreach ($links as $link) {
        // The attribute is a SimpleXMLElement; the query needs a plain string.
        $href = (string) $link['href'];
        if ($href === '') {
          // Anchors without a target give the crawler nothing to visit.
          continue;
        }
        // TODO: remove every link that leaves the domain.
        db_query("INSERT INTO {crawler_links} VALUES ('','%s','','','')", $href);
      }
    }

    return '<p>'. t('The quick brown fox jumps over the lazy dog.') .'</p>';
  }
  152.  
  /**
   * Implementation of hook_perm().
   *
   * @return
   *   The list of permission names defined by this module.
   */
  function security_scanner_perm() {
    $permissions = array();
    // Referenced by the 'access arguments' of the scanner menu item.
    $permissions[] = 'access scanner';
    return $permissions;
  }
  159.  
  /**
   * Implementation of hook_help().
   *
   * @param $path
   *   Value compared against the paths this module has help text for.
   * @param $arg
   *   Not used by this implementation.
   * @return
   *   Translated help text for the 'security_scanner' path, nothing for any
   *   other path.
   */
  function security_scanner_help($path, $arg) {
    // Help text for the custom page; every other path gets no help text.
    if ($path == 'security_scanner') {
      return t('This sentence contains all the letters in the English alphabet.');
    }
  }
  170.        
  171.    
  172. ?>