Fix for Crawler

  /**
   * Runs one crawler worker for a bounded time slice (150 seconds).
   *
   * Registers a new row in {crawler}, then repeatedly claims one unassigned
   * link from {crawler_links}, looks it up, marks it as executed, fetches the
   * page and hands the parsed page to the callback. The function returns as
   * soon as no unassigned link is left or the time slice is used up.
   *
   * @param $selection_query
   *   Array describing how the claimed link is looked up. Recognised keys:
   *   - 'fields': extra column expressions selected in addition to l.path.
   *   - 'join': SQL fragment placed right after "{crawler_links} AS l".
   *   - 'where': SQL fragment appended verbatim after the built-in
   *     conditions, so it has to bring its own leading " AND ".
   *   - 'parameters': values for the placeholders used by the fragments.
   * @param $initial_status
   *   Status the counting starts from: a claimed link is stored with
   *   $initial_status + 1 and an executed link with $initial_status + 2.
   * @param $function_callback
   *   Callable invoked as $function_callback($obj, $selected_results) for
   *   every fetched page, where $obj is the scanner object and
   *   $selected_results the row returned by the lookup query.
   */
  function crawler_framework($selection_query, $initial_status, $function_callback) {
    // Initialize the crawler: every run gets its own id.
    db_query("INSERT INTO {crawler} VALUES (default)");
    $crawler_id = db_last_insert_id('crawler', 'id');

    $selection_query += array(
      'fields' => array(),
      'join' => '',
      'where' => '',
      'parameters' => array(),
    );

    // Status written by the claim step below. The lookup query has to filter
    // on this very value; binding an unset variable here made the lookup
    // search for status 0 and never find the link that was just claimed.
    $claimed_status = $initial_status + 1;

    $fields = empty($selection_query['fields']) ? '' : ','. implode(', ', $selection_query['fields']);
    // The built-in conditions are qualified with the "l" alias so they stay
    // unambiguous when the caller joins further tables.
    $sql = 'SELECT l.path'. $fields .' FROM {crawler_links} AS l '. $selection_query['join'] .' WHERE l.crawler_id = %d AND l.status = %d'. $selection_query['where'];
    array_unshift($selection_query['parameters'], $crawler_id, $claimed_status);

    // The session cookie does not change while the loop runs, read it once.
    $session_cookie = variable_get('security_scanner_cookie','');

    $deadline = time() + 150;
    while (time() < $deadline) {
      // Claim one link that no crawler owns yet and mark it as visited.
      db_query("UPDATE {crawler_links} SET crawler_id = %d, status = %d WHERE crawler_id = 0 LIMIT 1", $crawler_id, $claimed_status);
      if (!db_affected_rows()) {
        // Nothing could be claimed: the queue is empty, so stop instead of
        // spinning until the deadline.
        // NOTE(review): this assumes {crawler_links} is seeded by other code
        // before the crawler runs - confirm.
        break;
      }

      // Get the claimed link from the crawler_links table.
      $selected_results = db_fetch_array(db_query_range($sql, $selection_query['parameters'], 0, 1));

      // Flag the claimed link as executed, whether or not the lookup (which
      // may carry extra caller conditions) returned it.
      db_query("UPDATE {crawler_links} SET status = status + 1 WHERE status = %d AND crawler_id = %d", $claimed_status, $crawler_id);

      if (empty($selected_results)) {
        // The claimed link did not satisfy the caller's conditions; there is
        // no path to fetch, move on to the next link.
        continue;
      }

      // Create a new scanner object, fetch the page with the stored session
      // cookie and parse it.
      $obj = new drupal_security_scanner_test();
      $obj->curl_options = array(
        CURLOPT_COOKIE => $session_cookie,
      );
      $obj->drupalGet($selected_results['path']);
      $obj->parse();
      $function_callback($obj, $selected_results);
      $obj->curlClose();
    }
  }
  38.  
  /**
   * Crawler: page processing function.
   *
   * Queues every link of the fetched page that points into this site and
   * records the form_id inputs found in the page.
   *
   * @param $obj
   *   Scanner object of the fetched page; its "elements" property is queried
   *   with xpath().
   * @param $selected_results
   *   Row of the crawled link as passed on by the crawler; its 'id' value is
   *   stored next to every form found on the page.
   */
  function security_scanner_page_processing($obj, $selected_results) {
    global $base_url;

    // Without a parsed document there is nothing to inspect.
    if (!isset($obj->elements) || !is_object($obj->elements)) {
      return;
    }

    // Drupal paths that must never be crawled:
    // - logout would destroy the session the cookie belongs to,
    // - the security scanner settings page is skipped,
    // - the xss_injector page would launch the crawler.
    $excluded_paths = array(
      'logout',
      'admin/settings/security_scanner',
      'admin/settings/xss_injector',
    );

    $links = $obj->elements->xpath('//a');
    if (!is_array($links)) {
      // xpath() returns FALSE on failure.
      $links = array();
    }
    foreach ($links as $link) {
      $url_to_save = (string)$link->attributes()->href;
      $absolute = getAbsoluteUrl($url_to_save);

      $parsed_url = parse_url($absolute);
      if ($parsed_url === FALSE) {
        // Seriously malformed URL, nothing sensible to queue.
        continue;
      }

      // Compare the "q" argument itself rather than the whole query string,
      // so that e.g. "q=logout&destination=node" is recognised as well.
      $query_args = array();
      if (isset($parsed_url['query'])) {
        parse_str($parsed_url['query'], $query_args);
      }
      $drupal_path = (isset($query_args['q']) && is_string($query_args['q'])) ? $query_args['q'] : '';

      // parse_url() has no 'file' component; the script name is the last
      // segment of 'path'. Fetching cron.php would make a loop.
      $script = isset($parsed_url['path']) ? basename($parsed_url['path']) : '';

      if (in_array($drupal_path, $excluded_paths, TRUE) || $script == 'cron.php') {
        continue;
      }

      // Only links below the site's base URL are queued.
      if (substr($absolute, 0, strlen($base_url)) == $base_url) {
        // Here we use IGNORE to insert a link into the table only once
        // ("path" is a unique index).
        db_query("INSERT IGNORE INTO {crawler_links} VALUES ('','%s','','')", $absolute);
      }
    }

    // Get the forms inside the page.
    $inputs = $obj->elements->xpath("//input[@name='form_id']");
    if (!is_array($inputs)) {
      $inputs = array();
    }
    // Id of the crawled link; 0 when the caller supplied no row.
    $link_id = isset($selected_results['id']) ? $selected_results['id'] : 0;
    foreach ($inputs as $input) {
      // NOTE(review): this stores the input's "id" attribute, not its
      // "value" - confirm that this is the identifier wanted here.
      $form_id = (string)$input->attributes()->id;
      // Here we use IGNORE again to insert a form_id into the table only once
      // ("form_id" is the primary key).
      db_query("INSERT IGNORE INTO {crawler_forms} VALUES ('%s','%d')", $form_id, $link_id);
    }
  }
  71.  
  /**
   * Entry point that starts a crawler run.
   *
   * NOTE(review): the name follows the pattern of a Drupal hook_cron()
   * implementation for a module called "security_scanner" - confirm.
   */
  function security_scanner_cron() {
    // The crawler needs the auth session cookie value; when none has been
    // stored yet, call the function that retrieves it.
    $stored_cookie = variable_get('security_scanner_cookie','');
    if ($stored_cookie == '') {
      drupal_security_scanner_get_auth_cookie();
    }

    // Besides the path, the crawler also has to load the id of each link,
    // because the page processing callback reads it from the selected row.
    $link_lookup = array();
    $link_lookup['fields'] = array('l.id');

    crawler_framework($link_lookup, 0, 'security_scanner_page_processing');
  }