"; $first_time=1; /// INITIAL TO IDENTIFY FIRST TIME CRAWL $img_width=80; /// THUMBNAIL IMAGE WIDTH $total_comments=0; $total_pics=0; /********************************************************************************************************/ /* START ACTUAL WORK HERE /********************************************************************************************************/ if ($HTTP_GET_VARS['url']) $url=$HTTP_GET_VARS['url']; /// RETRIEVE SOURCE'S URL TO CRAWL elseif ($argv[1]) $url=$argv[1]; /// IF IT IS PASSED AS AN COMMAND LINE ARGUMENT else { echo "no url passed.. please indicate one news source to crawl.\n"; exit; } //ERROR $bareurl=str_replace("-","",$url); /// RETRIEVE URL NAME & REMOVE NON-ALPHANUMERICS $url=$news_index[$bareurl]; /// INDICATE STARTING ADDRESS TO CRAWL $baseurl=substr($url,0,-strlen(strrchr($url,'/'))); /// IDENTIFY THE BASE URL TO USE FOR REFERENCE if (!array_key_exists($bareurl, $url_regex)) { echo "News source (".$bareurl.") not in DB."; exit; } // SOURCE NOT IN DB /*********** if a google news source *********/ if ($bareurl=="googlenews") { for ($k=0; $k<10; $k++) /// FETCH THE FIRST TEN PAGES OF GOOGLE RESULTS WE NEED TO PARSE { $source_files[$k]=shell_exec('links -source \''.$news_index[$bareurl].strval($k*10).'\''); $source_files[$k]=preg_replace($news_block[$bareurl],'$1',$source_files[$k],1); } for ($i=0; $i<$k; $i++) $source_file=$source_file.$source_files[$i]; $google_file=preg_split('/
'.$arabic_month.'<\/p>(\d\d)
/is',$contents,$temp);
if ($temp[1]) return ($temp[1]);
$handle = fopen("months2.html", "r");
$contents = fread($handle, filesize("months.html")); fclose($handle);
$s=preg_match('/
'.$arabic_month.'<\/p>(\d\d)
/is',$contents,$temp);
return ($temp[1]);
}
/********************************************************************************************************/
/* FUNCTION TO CONVERT ENGLISH FORMATTED DATE TO SQL FORMAT
/********************************************************************************************************/
/**
 * Convert an English-formatted date/time string to a "YYYY/MM/DD" string.
 *
 * Three input shapes are recognised, tried in this order:
 *   1. "<Month> <day>, <year>" (e.g. "January 5, 2010"): returns
 *      year/month/day, where the month is translated by month_num2() and the
 *      day is copied verbatim from the input (it is not zero-padded).
 *   2. Text containing "hour" (e.g. "3 hours ago"): returns the current time
 *      minus that many hours, formatted with date().
 *   3. Text containing "minute" (e.g. "15 minutes ago"): returns the current
 *      time minus that many minutes, formatted with gmdate().
 *
 * If the text contains "hour"/"minute" but no leading number can be extracted,
 * the offset is treated as zero and today's date is returned.
 *
 * @param string $original_t Date text as found in the crawled page.
 * @return string|null "YYYY/MM/DD"-style date, or null when no shape matches.
 */
function convert_time_eng($original_t)
{
    // Absolute date: only trust the capture groups when the pattern matched,
    // otherwise $dt is empty and indexing it raises undefined-offset notices.
    if (preg_match('/\s*(.+?) (\d+?)\, (\d\d\d\d)/is', $original_t, $dt)
        && $dt[1] && $dt[2] && $dt[3]) {
        return $dt[3] . "/" . month_num2($dt[1]) . "/" . $dt[2];
    }

    // Relative age in hours. strpos() returns 0 for a match at the start of
    // the string, so it must be compared against false explicitly.
    if (strpos($original_t, "hour") !== false) {
        $hours = preg_match('/\s*(\d+) hour/is', $original_t, $dt) ? intval($dt[1]) : 0;
        return date('Y/m/d', time() - $hours * 60 * 60);
    }

    // Relative age in minutes.
    // NOTE(review): this branch formats with gmdate() (UTC) while the hour
    // branch uses date() (server timezone); kept as-is, confirm which one is
    // intended.
    if (strpos($original_t, "minute") !== false) {
        $minutes = preg_match('/\s*(\d+) minute/is', $original_t, $dt) ? intval($dt[1]) : 0;
        return gmdate('Y/m/d', time() - $minutes * 60);
    }

    // Unrecognised format.
    return null;
}
/********************************************************************************************************/
/* FUNCTION TO CONVERT ENGLISH MONTHS TO NUMBERS
/********************************************************************************************************/
/**
 * Translate an English month name into its two-digit month number.
 *
 * Only the first three letters are significant and matching is
 * case-insensitive, so "January", "JAN" and "jan." all yield "01".
 * Leading/trailing whitespace is ignored.
 *
 * @param string $english_month Month name or abbreviation.
 * @return string|null Zero-padded month number "01".."12", or null when the
 *                     name is not recognised.
 */
function month_num2($english_month)
{
    static $month_numbers = array(
        'jan' => "01",
        'feb' => "02",
        'mar' => "03",
        'apr' => "04",
        'may' => "05",
        'jun' => "06",
        'jul' => "07",
        'aug' => "08",
        'sep' => "09",
        'oct' => "10",
        'nov' => "11",
        'dec' => "12",
    );

    // Normalise to a lowercase three-letter key before the lookup.
    $key = substr(strtolower(trim($english_month)), 0, 3);

    return isset($month_numbers[$key]) ? $month_numbers[$key] : null;
}
/********************************************************************************************************/
/* FUNCTION TO CREATE AND STORE THUMBNAILS AND RETURN REFERENCE FOR DB
/********************************************************************************************************/
function store_img($b,$img,$w)
{
if (!remote_file_exists($img)) return ("");
$bin_img=shell_exec('links -source \''.addslashes($img).'\'');
//echo "[".'links -source \''.$img.'\''."]\n";
$ps=strpos($bin_img,"