Fix for Crawler

  1. <?php
  /**
   * Runs one crawl step: claims a link, fetches it and hands the page to a callback.
   *
   * Flow, for a caller passing $status:
   *  1. a new row in {crawler} supplies the id of this crawler run;
   *  2. one unclaimed link (crawler_id = 0) receives that id and status
   *     $status + 1;
   *  3. that link is selected and moved to status $status + 2 (executed);
   *  4. the page is fetched, parsed and passed to $function_callback.
   *
   * @param $selection_query
   *   Array describing optional parts of the SELECT on {crawler_links} AS l:
   *   - 'fields': extra columns appended after l.path.
   *   - 'join': SQL fragment inserted between the FROM and WHERE clauses.
   *   - 'parameters': extra query arguments; they follow the crawler id and
   *     the status, which are always the first two arguments.
   *   - 'where': accepted and defaulted to '', but not used by the query.
   *     NOTE(review): decide whether it should be appended or dropped.
   * @param $status
   *   Status the links are expected to be in before this step.
   * @param $function_callback
   *   Name of a function invoked as $function_callback($obj, $row), where $obj
   *   is the drupal_security_scanner_test instance holding the fetched page
   *   and $row is the selected {crawler_links} row.
   *
   * @return
   *   TRUE when a link was selected and processed, FALSE when no link could
   *   be selected for this crawler run.
   */
  function crawler_framework($selection_query, $status, $function_callback) {
    // Fill in the optional keys first so nothing below reads an undefined index.
    $selection_query += array(
      'fields' => array(),
      'join' => '',
      'where' => '',
      'parameters' => array(),
    );
    $claimed_status = $status + 1;
    $executed_status = $status + 2;

    // Initialize the crawler: every run gets its own id.
    db_query("INSERT INTO {crawler} VALUES (default)");
    $crawler_id = db_last_insert_id('crawler', 'id');

    // Claim a single unclaimed link. Only one page is fetched per call, so
    // claiming (and later marking as executed) every unclaimed row would make
    // the other links look crawled without ever being requested.
    // LIMIT on UPDATE is MySQL syntax, like the INSERT IGNORE used elsewhere
    // in this file.
    db_query("UPDATE {crawler_links} SET crawler_id = %d, status = %d WHERE crawler_id = 0 LIMIT 1", $crawler_id, $claimed_status);

    // Get the claimed link from the crawler_links table. The columns are
    // qualified with the table alias so a caller-supplied join cannot make
    // them ambiguous.
    $fields = empty($selection_query['fields']) ? '' : ','. implode(', ', $selection_query['fields']);
    $sql = 'SELECT l.path'. $fields .' FROM {crawler_links} AS l '. $selection_query['join'] .' WHERE l.crawler_id = %d AND l.status = %d';
    // The crawler id and status fill the two placeholders above; any
    // caller-supplied arguments come after them.
    $parameters = $selection_query['parameters'];
    array_unshift($parameters, $crawler_id, $claimed_status);
    $selected_results = db_fetch_array(db_query_range($sql, $parameters, 0, 1));
    if (empty($selected_results)) {
      // Nothing was claimed (or the join filtered the row out): there is no
      // page to fetch.
      return FALSE;
    }

    // Flag the link as executed before fetching it.
    db_query("UPDATE {crawler_links} SET status = %d WHERE crawler_id = %d AND status = %d", $executed_status, $crawler_id, $claimed_status);

    // Create a new browser object, reuse the stored session cookie and parse
    // the page.
    $obj = new drupal_security_scanner_test();
    $session_cookie = variable_get('security_scanner_cookie','');
    $obj->curl_options = array(
      CURLOPT_COOKIE => $session_cookie,
    );
    $obj->drupalGet($selected_results['path']);
    $obj->parse();
    $function_callback($obj, $selected_results);
    $obj->curlClose();
    return TRUE;
  }
  44.  
  /**
   * Crawler: page processing function.
   *
   * Stores every same-site link found in the fetched page into
   * {crawler_links}, and every form_id input found into {crawler_forms}.
   *
   * @param $obj
   *   Object holding the parsed page; its ->elements property must support
   *   xpath() queries.
   * @param $selected_results
   *   Row of the link that was fetched; its 'id' value is stored next to each
   *   form found in the page.
   */
  function security_scanner_page_processing($obj, $selected_results) {
    global $base_url;
    // Query strings that must never be requested:
    // - logout would make the crawler lose its session cookie;
    // - the security scanner settings page is skipped;
    // - the xss_injector page would launch the crawler.
    $skipped_queries = array(
      'q=logout',
      'q=admin/settings/security_scanner',
      'q=admin/settings/xss_injector',
    );

    $links = $obj->elements->xpath('//a');
    // xpath() returns FALSE on error, which foreach cannot iterate.
    if (!empty($links)) {
      foreach ($links as $link) {
        $url_to_save = (string)$link->attributes()->href;
        $absolute = getAbsoluteUrl($url_to_save);
        $parsed_url = parse_url($absolute);
        if ($parsed_url === FALSE) {
          // Seriously malformed URL: nothing usable to store.
          continue;
        }
        // parse_url() only returns the components that are present in the URL.
        $query = isset($parsed_url['query']) ? $parsed_url['query'] : '';
        // parse_url() has no 'file' component; the script name is the last
        // segment of 'path'. Requesting cron.php would make the crawler loop.
        $file = isset($parsed_url['path']) ? basename($parsed_url['path']) : '';
        if (in_array($query, $skipped_queries, TRUE) || $file === 'cron.php') {
          continue;
        }
        // Only keep links that point into this site.
        if (substr($absolute, 0, strlen($base_url)) === $base_url) {
          // Here we use IGNORE to insert only one time a link into the table. ("path" is a unique index)
          db_query("INSERT IGNORE INTO {crawler_links} VALUES ('','%s','','')", $absolute);
        }
      }
    }

    // Get the forms inside the page.
    $inputs = $obj->elements->xpath("//input[@name='form_id']");
    if (!empty($inputs)) {
      foreach ($inputs as $input) {
        // NOTE(review): this reads the HTML "id" attribute of the hidden
        // input, not its "value" - confirm which one {crawler_forms} expects.
        $form_id = (string)$input->attributes()->id;
        // Here we use again IGNORE to insert only one time a form_id into the table. ("form_id" is the primary key)
        db_query("INSERT IGNORE INTO {crawler_forms} VALUES ('%s','%d')", $form_id, $selected_results['id']);
      }
    }
  }
  77.  
  /**
   * Cron entry point of the crawler.
   *
   * Makes sure a session cookie value is stored, then keeps running single
   * crawl steps until roughly 30 seconds have passed.
   */
  function security_scanner_cron() {
    // When no cookie value is stored yet, call the helper that retrieves it.
    $stored_cookie = variable_get('security_scanner_cookie','');
    if ($stored_cookie == '') {
      drupal_security_scanner_get_auth_cookie();
    }

    // Ask the crawler to also return the id of the link it selects; the page
    // processing callback stores that id next to the forms it finds.
    $extra_columns = array(
      'fields' => array('l.id'),
    );

    // Time-boxed loop: each iteration is one crawl step starting from status 0.
    for ($deadline = time() + 30; time() < $deadline; ) {
      crawler_framework($extra_columns, 0, 'security_scanner_page_processing');
    }
  }
  96.