Crawler

  /**
   * Runs a single crawl step: claims one pending link, fetches the page and
   * hands the parsed result to a callback.
   *
   * Flow for the claimed link (with $status as passed by the caller):
   *  1. a new row is inserted into {crawler}; its id is this run's crawler_id;
   *  2. ONE unclaimed link (crawler_id = 0) receives that crawler_id and
   *     status $status + 1;
   *  3. the link is selected and moved to status $status + 2, then fetched.
   *
   * @param $selection_query
   *   Array with the keys 'select', 'from', 'where' and 'parameters'. The
   *   'select', 'from' and 'where' strings are concatenated into the SELECT on
   *   {crawler_links} (aliased "l"). Index 0 of 'parameters' is overwritten
   *   with the crawler id; the query then receives the imploded parameters
   *   followed by the claimed status as placeholder arguments.
   * @param $status
   *   Status value the step starts from; it is incremented twice (see flow).
   * @param $function_callback
   *   Name of a function invoked as $function_callback($page, $link_row),
   *   where $page is the fetched and parsed page object and $link_row is the
   *   selected row (or FALSE when no row matched).
   */
  function crawler_framework($selection_query, $status, $function_callback) {
    // Register this run; the generated id marks the link this run owns.
    db_query("INSERT INTO {crawler} VALUES (default)");
    $crawler_id = db_last_insert_id('crawler', 'id');
    $selection_query['parameters'][0] = $crawler_id;

    // Claim exactly one unclaimed link. Without the LIMIT every pending link
    // was claimed by this run, but only one of them is fetched below while the
    // second UPDATE moved all of them to the final status, so the others were
    // silently dropped from the crawl.
    $status++;
    db_query("UPDATE {crawler_links} SET crawler_id = %d, status = %d WHERE crawler_id = 0 LIMIT 1", $crawler_id, $status);

    // Load the claimed link.
    // NOTE(review): implode() collapses 'parameters' into one string argument,
    // so only a single-element 'parameters' array maps cleanly onto the
    // placeholders - confirm before passing more than one parameter.
    $sql = "SELECT l.path". $selection_query['select'] ." FROM {crawler_links} AS l ". $selection_query['from'] ." ". $selection_query['where'];
    $result = db_query_range($sql, (implode(",", $selection_query['parameters'])), $status, 0, 1);
    $selected_results = db_fetch_array($result);

    // Flag the link as processed before fetching it.
    $claimed_status = $status;
    $status++;
    db_query("UPDATE {crawler_links} SET status = %d WHERE crawler_id = %d and status = %d", $status, $crawler_id, $claimed_status);

    // Fetch the page with the stored session cookie and parse it.
    // NOTE(review): when no link was claimed $selected_results is FALSE and the
    // path passed to drupalGet() is NULL; this is left untouched because it may
    // be what seeds an empty {crawler_links} table - confirm.
    $obj = new drupal_security_scanner_test();
    $obj->curl_options = array(
      CURLOPT_COOKIE => variable_get('security_scanner_cookie', ''),
    );
    $obj->drupalGet($selected_results['path']);
    $obj->parse();
    $function_callback($obj, $selected_results);
    $obj->curlClose();
  }
  35.  
  /**
   * Crawler callback: records the links and forms found on a fetched page.
   *
   * Every <a> href is made absolute and, when it lies under $base_url and is
   * not one of the excluded targets, inserted into {crawler_links}. Every
   * input named "form_id" is inserted into {crawler_forms} together with the
   * id of the link the page was loaded from.
   *
   * @param $obj
   *   Parsed page object; its "elements" property must support xpath().
   * @param $selected_results
   *   Row of the crawled link; the 'id' key is stored next to each form.
   */
  function security_scanner_page_processing($obj, $selected_results) {
    global $base_url;

    // Query strings that must never be followed:
    //  - logout would invalidate the session cookie used by the crawler,
    //  - the security scanner settings page is skipped,
    //  - the xss_injector page would launch the crawler again.
    $skipped_queries = array(
      'q=logout',
      'q=admin/settings/security_scanner',
      'q=admin/settings/xss_injector',
    );

    // xpath() returns FALSE on error; treat that as "no links".
    $links = $obj->elements->xpath('//a');
    if (!is_array($links)) {
      $links = array();
    }
    foreach ($links as $link) {
      $absolute = getAbsoluteUrl((string) $link->attributes()->href);
      $parsed_url = parse_url($absolute);

      // parse_url() omits absent components, so default them explicitly.
      $query = isset($parsed_url['query']) ? $parsed_url['query'] : '';
      // parse_url() has no 'file' component; the script name is the last
      // segment of 'path'. cron.php is skipped because it would make a loop.
      $script = isset($parsed_url['path']) ? basename($parsed_url['path']) : '';

      if (in_array($query, $skipped_queries, TRUE) || $script == 'cron.php') {
        continue;
      }
      // Only links that stay on this site are recorded.
      if (substr($absolute, 0, strlen($base_url)) == $base_url) {
        // IGNORE keeps a link from being inserted twice ("path" is a unique index).
        db_query("INSERT IGNORE INTO {crawler_links} VALUES ('','%s','','')", $absolute);
      }
    }

    // Record the forms contained in the page.
    $inputs = $obj->elements->xpath("//input[@name='form_id']");
    if (!is_array($inputs)) {
      $inputs = array();
    }
    foreach ($inputs as $input) {
      // NOTE(review): this stores the input's "id" attribute, not its "value"
      // attribute - confirm which one the consumers of {crawler_forms} expect.
      $form_id = (string) $input->attributes()->id;
      // IGNORE keeps a form from being inserted twice ("form_id" is the primary key).
      db_query("INSERT IGNORE INTO {crawler_forms} VALUES ('%s','%d')", $form_id, $selected_results['id']);
    }
  }
  68.  
  /**
   * Cron entry point of the crawler (presumably the module's hook_cron()
   * implementation - confirm).
   *
   * Makes sure an authenticated session cookie is stored, then keeps running
   * crawl steps for a 30 second time budget.
   */
  function security_scanner_cron() {
    // Fetch the auth session cookie only when none has been stored yet, so
    // that several runs can share the same one.
    $stored_cookie = variable_get('security_scanner_cookie', '');
    if ('' == $stored_cookie) {
      drupal_security_scanner_get_auth_cookie();
    }

    // Pieces of the link selection query; they produce the equivalent of
    // "SELECT path, id FROM {crawler_links} WHERE crawler_id = %d AND status = %d LIMIT 1".
    $link_query = array(
      'parameters' => array(),
      'where' => 'WHERE crawler_id = %d and status = %d',
      'from' => '',
      'select' => ',l.id',
    );

    // Crawl one link per iteration until the 30 second budget is used up.
    for ($deadline = time() + 30; time() < $deadline;) {
      crawler_framework($link_query, 0, 'security_scanner_page_processing');
    }
  }
  90.