"; $first_time=1; /// INITIAL TO IDENTIFY FIRST TIME CRAWL $img_width=80; /// THUMBNAIL IMAGE WIDTH $total_comments=0; $total_pics=0; /********************************************************************************************************/ /* START ACTUAL WORK HERE /********************************************************************************************************/ if ($HTTP_GET_VARS['url']) $url=$HTTP_GET_VARS['url']; /// RETRIEVE SOURCE'S URL TO CRAWL elseif ($argv[1]) $url=$argv[1]; /// IF IT IS PASSED AS AN COMMAND LINE ARGUMENT else { echo "no url passed.. please indicate one news source to crawl.\n"; exit; } //ERROR $bareurl=str_replace("-","",$url); /// RETRIEVE URL NAME & REMOVE NON-ALPHANUMERICS $url=$news_index[$bareurl]; /// INDICATE STARTING ADDRESS TO CRAWL $baseurl=substr($url,0,-strlen(strrchr($url,'/'))); /// IDENTIFY THE BASE URL TO USE FOR REFERENCE if (!array_key_exists($bareurl, $url_regex)) { echo "News source (".$bareurl.") not in DB."; exit; } // SOURCE NOT IN DB /*********** if a google news source *********/ if ($bareurl=="googlenews") { for ($k=0; $k<10; $k++) /// FETCH THE FIRST TEN PAGES OF GOOGLE RESULTS WE NEED TO PARSE { $source_files[$k]=shell_exec('links -source \''.$news_index[$bareurl].strval($k*10).'\''); $source_files[$k]=preg_replace($news_block[$bareurl],'$1',$source_files[$k],1); } for ($i=0; $i<$k; $i++) $source_file=$source_file.$source_files[$i]; $google_file=preg_split('//si',$source_file); array_shift($google_file); //print_r($google_file); // array_shift($google_file); } /*********** if OTHER individual news sources *********/ else { $source_file=shell_exec("links -source '".$news_index[$bareurl]."'"); //echo $news_index[$bareurl]." ($source_file)
(".$news_block[$bareurl].")"; exit; if ($HTTP_GET_VARS['t4']) { echo "($source_file)
"; exit; } $source_file=preg_replace($news_block[$bareurl],'$1',$source_file,1); /// FETCH THE BODY WE NEED TO PARSE if ($bareurl=="aladhwaa") $source_file=preg_replace('/.+/is','',$source_file); if ($HTTP_GET_VARS['t3']) { echo "($source_file)
"; exit; } //echo "[".$url_regex[$bareurl]."]\n"; //echo "($source_file)
"; exit; } $source_file=str_replace("$baseurl/","",$source_file); // FIND THE BASE URL TO USE LATER $source_file=str_replace($baseurl,"",$source_file); $baseurl=$baseurl."/"; /*********** IDENTIFY ALL LINKS TO STORIES *********/ preg_match_all($url_regex[$bareurl],$source_file,$u); /***************************************************/ //echo "(urex:".$url_regex[$bareurl]." file:($source_file)"; /******* SOME EXCEPTIONS FOR A FEW SOURCES ******/ if ($bareurl=="aleshteraki") { for ($l=0;$l"\'\s]*?ac=0/si','',$u[1][$l]); } $u=array_unique($u[1]); /// REMOVE DUPLICATE STORIES /**** DELETE ENTRIES FROM GOOGLE NEWS THAT ARE FROM SOURCES WE PARSED INDIVIDUALLY **/ if ($bareurl=="googlenews") { $tot=sizeof($u); for ($ind=0; $ind<$tot; $ind++) { if (news_source_exists($u[$ind],$url_regex)) { //echo "exists (${u[$ind]})
\n"; for ($n=$ind; $n<$tot; $n++) { $u[$n]=$u[$n+1]; $google_file[$n]=$google_file[$n+1]; } array_pop($u); $ind--; array_pop($google_file); } } $google_file=array_values($google_file); } $u=array_values(array_unique($u)); $max=40; if ($bareurl=="googlenews") { $google_file=array_slice($google_file,0,$max); $google_file=array_reverse($google_file); } $u=array_slice($u,0,$max); $u=array_reverse($u); if ($HTTP_GET_VARS['t1']) { print_r($u); exit; } //******************************************************************************************************** //* START LOOP******************************************************************************************** //******************************************************************************************************** if (sizeof($u)<$max) $max=sizeof($u); //$u[$max-1]="index.php?option=com_content&task=view&id=97&Itemid=9"; if ($HTTP_GET_VARS['t2']) $temp_m=$max-1; else $temp_m=0; for ($i=$temp_m; $i<$max; $i++) //for ($i=0; $i<1; $i++) { unset($title,$date,$link,$updated,$writer,$image,$story,$comments,$c[1]); $skip_w=0; if ($bareurl=="yemenhurr") { preg_match('/\s*(.+?)\s*
/si',$u[$i],$date); preg_match('/ href="?(modules\.php\?name=News\&[^">\s]*file=article\&[^">\s]*sid=\d+)/si',$u[$i],$t); $u[$i]=$t[1]; } //echo "u$i:(".$u[$i]." d:${date[1]})"; //continue; if (strlen($u[$i])<2) continue; if (substr($u[$i],0,7)!="http://") $u[$i]=$baseurl.$u[$i]; /// ensure links are complete $u[$i]=urldecode($u[$i]); $u[$i]=html_entity_decode($u[$i]); /// decode if needed //echo "(u$i):".$u[$i]."
"; continue; if ($first_time) /// first loop entry (executed once only) { //******* CONNECT TO DATABASE ****** if (!$test) { mysql_connect("mysql7.websitesource.net","investo_yportal","yemenportal") or die( "no connection"); mysql_select_db("investo_yp") or die ("no select"); $table_exists=mysql_num_rows( mysql_query("SHOW TABLES LIKE 'ALL_STORIES'")); } else echo "
[Connected to DB]
"; //******* create table for source (and delete old one in test mode) ****** if ($prune || !$table_exists) { if ($table_exists) { echo "rows of ($bareurl) from ALL_STORIES dropped...
"; if (!$test) mysql_query('DELETE FROM ALL_STORIES WHERE source=\''.$bareurl.'\''); } $query="CREATE TABLE IF NOT EXISTS STATS (updated TIMESTAMP, total INT, google INT, ". "alexa1 INT, alexa2 TINYINT,daily_hits INT,visitors INT, files INT, total_hits INT)"; if (!$test) mysql_query($query)or die("Create table Error: ".mysql_error()); /// apply the query else echo "
[$query]
"; $query='CREATE TABLE IF NOT EXISTS SOURCES (source VARCHAR(50), latest_date DATE,'. 'updated TIMESTAMP, total INT, views INT, votes INT, comment_no INT, pic INT, last_ip BIGINT, PRIMARY KEY(source))'; if (!$test) mysql_query($query)or die("Create table Error: ".mysql_error()); /// apply the query else echo "
[$query]
"; if (!$test) mysql_query('ESCAPE \'\\\''); /// use escape figure echo "rows for ($bareurl just created.
"; /// some notification $query= <<[$query]
"; if (!$test) { $rs=mysql_query("SHOW INDEX FROM ALL_STORIES"); $r=mysql_fetch_array($rs); $r=mysql_fetch_array($rs); if ($r[2]!="ALL_STORIES_ix") mysql_query('CREATE FULLTEXT INDEX ALL_STORIES_ix ON ALL_STORIES(title,story,comments)')or die(mysql_error()); } else echo "
[CREATE FULLTEXT INDEX ALL_STORIES_ix ON ALL_STORIES(title,story,comment_no)]
"; } // end test mode $first_time=0; $s=""; $latest_date="01-01-2000"; } // end first time condition //echo "here1"; continue; if (!$test && (!$skip || !$comments_frame[$bareurl])&& exists($u[$i])) continue; // skip if link exists in db $img=""; $pic=0; $whole_file=""; $comments_str=""; /// reset variables we will use in loop //*********** stories for google are retrieved differently *********** if ($bareurl=="googlenews" || $bareurl=="googlenews_eng") $whole_file=$google_file[$i]; else $whole_file=shell_exec('links -source \''.$u[$i].'\''); // else retrieve it from the WWW //echo "[".$main_block[$bareurl]."]($whole_file)"; continue; //echo "here1"; continue; //****** First retrieve some information (e.g. date) needed for some sources ****** if ($bareurl=="alayyam") { $date[1]=preg_replace('/.+\s*(\d+)\/(\d+)\/(\d\d\d\d)\s.+/si', '$3'.'/'.'$2'.'/'.'$1',$whole_file); $whole_file=preg_replace($main_block[$bareurl],'$1',$whole_file); /* $whole_file=preg_replace('/.+?<\/span>/si','',$whole_file);*/ $whole_file=preg_replace('/<\s*a\s.+?href.+?>.+?<\/\s*a\s*>/si','',$whole_file); } elseif ($bareurl=="14october") { $date[1]=preg_replace('/.+'. 
'.+?\,\s*(.+?)<\/font><\/b><\/span>.+/si','$1',$whole_file); $whole_file=preg_replace($main_block[$bareurl],'$1',$whole_file); } elseif ($bareurl=="sabanews") { $date[1]=preg_replace('/.+\s*(\d+\/\d+\/\d+)\s*.+/si','$1',$whole_file); $whole_file=preg_replace($main_block[$bareurl],'$1',$whole_file); } elseif ($bareurl=="alwahdah") { $title[1]=preg_replace('/.+20\; font\-weight\:bold\;\">(.+?)gmdate('Y') || ($dd[2]>gmdate('m') && $dd[1]>=gmdate('Y')) || ($dd[3]>gmdate('d') && $dd[2]>=gmdate('m') && $dd[1]>=gmdate('Y')) || $dd[1]==0 || $dd[2]==0 || $dd[3]==0) $date[1]=gmdate('Y').'/'.gmdate('m').'/'.gmdate('d'); //echo "d2:[${date[1]}]";// exit; //****** Retrieve the writer of the story ****** preg_match($writer_frame[$bareurl],$whole_file,$writer); //****** Retrieve the main image of the story (if available) ****** preg_match($image_frame[$bareurl],$whole_file,$image); //echo "(".$image_frame[$bareurl].") (${image[1]})
($whole_file)"; //***** some exceptions ******* if ($bareurl=="todaynews") $image[1]=str_replace("todaynews/","",$image[1]); //****** Retrieve the story text ****** preg_match($story_frame[$bareurl],$whole_file,$story); //****** Retrieve the comments if available (per comment) ****** preg_match_all($comments_frame[$bareurl],$whole_file,$c, PREG_PATTERN_ORDER); //****** Calculate the time the story was retrieved ****** $updated=gmdate("Y-m-d H:i:")."0"; //****** Do some formatting for retrieved values ****** $title[1]=preg_replace('//is'," ... ",$title[1]); $title[1]=preg_replace('/.+?<\/a>/is',"",$title[1]); $title[1]=preg_replace('/\ \;/si',' ',$title[1]); $writer[1]=preg_replace('/.+/is',"\n",$story[1]); if ($bareurl=="yemenobserver") { $temp_w=preg_quote($writer[1]); $story[1]=preg_replace('/'.$temp_w.'/','',$story[1]); } if ($image[1]&& substr($image[1],0,7)!="http://") { if(strpos($image[1],'/')===0) { $temp=preg_replace('/(http:\/\/.+?)\/.+/i','$1',$baseurl); if (strrpos($temp,'/')) $temp = substr_replace($temp,"",-1); //echo "(t:$temp)"; $image[1]=$temp.$image[1]; } else $image[1]=$baseurl.$image[1]; } $image[1]=str_replace("news.google.com/","",$image[1]); if ($image[1]) $img=store_img($bareurl,$image[1],$img_width); if ($img) $pic=1; else $pic=0; //***** Strip tags from retrieved fields ***** $title[1]=strip_tags($title[1]); $writer[1]=strip_tags($writer[1]); $story[1]=strip_tags($story[1]); $comments_total=sizeof($c[1]); for ($j=0; $j<$comments_total; $j++) /// format comments { $c[1][$j]=preg_replace('//is',"\n",$c[1][$j]); $c[1][$j]=strip_tags($c[1][$j]); $comments_str=$comments_str."##comment##".$c[1][$j]; } $total_comments=$total_comments+$comments_total; //***************** convert from Windows-1256 to UTF-8 ************** if ($bareurl!="althawranews" && $bareurl!="googlenews" && $bareurl!="googlenews_eng") { $title[1]=iconv("WINDOWS-1256","UTF-8",$title[1]); if ($skip_w==0) $writer[1]=iconv("WINDOWS-1256","UTF-8",$writer[1]); 
$story[1]=iconv("WINDOWS-1256","UTF-8",$story[1]); $comments_str=iconv("WINDOWS-1256","UTF-8",$comments_str); } //***** Obtained fields need to be placed in string for later delivery ***** if ($bareurl!="googlenews" && $bareurl!="googlenews_eng" && strlen($writer[1])<5 || strlen($writer[1])>1000) $writer[1]=$crawler_sources[$bareurl]['name']; $s=$s. <<
link:${u[$i]}
title:${title[1]}
date:${date[1]}
updated:$updated
writer:${writer[1]} RESULT; if ($image[1] && $img) /// there is an image associated { $s=$s. <<
local image: location:$img
remote image:${image[1]} RESULT; $total_pics=$total_pics+1; } else $s=$s."\n
image:NO IMAGE"; $s=$s. <<
story:${story[1]}
comments: RESULT; $s= $s . str_replace('##comment##','

',$comments_str)."\n"; $s=$s . "
Position on page:".strval($max-$i)."
\n"; //***** INSERT INTO THE DATABASE ****** if ($title[1] && (strlen($story[1])>500 || (strlen($story[1])>10 && $bareurl=="googlenews_eng"))) { $query='INSERT INTO ALL_STORIES VALUES (\''.$bareurl.'\', '.$crawler_sources[$bareurl]['affiliation'].', '. $crawler_sources[$bareurl]['paper'].', '.$crawler_sources[$bareurl]['daily'].', \''.$u[$i].'\', \''.addslashes($title[1]). '\', \''.$date[1].'\', \''.$updated.'\', \''.addslashes($writer[1]).'\', \''.$img.'\', \''. addslashes($image[1]).'\', \''.addslashes($story[1]).'\', \''.addslashes($comments_str).'\', '. $comments_total.', '.$pic.', 0, 0, 0) ON DUPLICATE KEY UPDATE comments=\''.addslashes($comments_str). '\', comment_no='.$comments_total; //echo "[$query]"; exit; if (!$test) mysql_query($query)or die("q: $query - AL Insert table Error: ".mysql_error()); else echo "
[$query]
"; $s=$s."Result:(".$u[$i].") inserted successfully

"; //echo "date: ${date[1]} - d_srttot:[".strtotime($date[1])."] ld:[$latest_date] ts_ld:[".strtotime($latest_date)."])
"; if (strtotime($date[1])>strtotime($latest_date)) $latest_date=$date[1]; $count++; // COUNT THE NUMBER OF INSERTED ROWS //echo "$i:(d:[${date[1]}] ld:[$latest_date] ts_ld:[".strtotime($latest_date)."])
"; } else $s=$s."Result:(".$u[$i].") failed to be inserted

"; if ($verbose) { echo $s; $s=""; } /// IF THIS IS TEST MODE, CLEAR THE STRING AND JUST DISPLAY } /// END OF LINKS LOOP //echo "($count - $updated)"; if ($count>0 && $updated) { $s=$s."
Done updating all $count records for ($bareurl) at:".gmdate("F j, Y, g:i a")."...
"; if (!$test) $s=$s."
Total number of stories in DB:".update_total($bareurl,$latest_date,$updated,$count)."
"; } else echo "No new articles for ($bareurl)"; if (!$test) mysql_close(); /// CLOSE THE DB else echo "
[mysql_close]
"; if ($verbose) echo $s; /// IF THIS IS A TEST, JUST DISPLAY else /// ELSE SEND AN EMAIL TO THE WEBMASTER WITH THE DETAILS { if ($count) send_update($bareurl,"New update:".gmdate("F j, Y, g:i a")."\n
----
\n
\n$s",$count); else echo "No news for ($bareurl) at (".gmdate("F j, Y, g:i a").")
\n"; } /********************************************************************************************************/ /* START FUNCTIONS USED ABOVE /********************************************************************************************************/
/**
 * Check whether a story URL is already stored in the ALL_STORIES table.
 *
 * Requires an open mysql_* connection with the database already selected.
 *
 * @param string $u URL of the story to look up.
 * @return bool True when at least one row carries this URL; false when none
 *              does or when the lookup query itself fails.
 */
function exists($u)
{
    // The URL is scraped from remote pages, so escape it before it goes into SQL.
    $safe_url = mysql_real_escape_string($u);

    // The table name must not be wrapped in single quotes: that turns it into
    // a string literal and makes the whole statement a syntax error.
    $result = mysql_query("SELECT COUNT(*) FROM ALL_STORIES WHERE url='$safe_url'");
    if (!$result) {
        return false;
    }

    $row = mysql_fetch_row($result);

    // mysql_fetch_row() hands back column values as strings, so the count has
    // to be compared numerically (a strict === 0 test can never succeed).
    return ($row && intval($row[0]) > 0);
}
/********************************************************************************************************/ /*******FUNCTION TO CHECK IF A STORY ALREADY EXISTS IN THE DB *******************************************/ function update_total($s,$ld,$u,$tot) { global $total_comments,$total_pics, $prune,$crawler_sources; $total=0; //print_r($list); $out = shell_exec("links -source 'http://client.alexa.com/common/css/scramble.css'"); preg_match_all('/\.([A-Z0-9]+)\s*{\s*display\s*:\s*none[\s;]*}/si', $out, $matches); $out = shell_exec("links -source 'http://alexa.com/data/details/traffic_details?url=yemenportal.net'"); preg_match('/Traffic Rank for \s*yemenportal\.net\:<\/span>\s*\ \;<\!.+?>(.+?)<\/span>
/si',$out,$a1); $alexa1=$a1[1]; foreach ($matches[1] as $s1) $alexa1 = preg_replace("/\d+<\/span>/si", "", $alexa1); $alexa1 = strip_tags($alexa1); preg_match('/traffic rank in other countries.+?-->(.+?)<\/div>/si',$out,$a2); $alexa2=$a2[1]; foreach ($matches[1] as $s1) $alexa2 = preg_replace("/\d+<\/span>/si", "", $alexa2); $alexa2 = strip_tags($alexa2); $google=shell_exec("links -source 'http://google.com/search?q=yemenportal.net'"); preg_match('/ of about ([\d\,]+)<\/b> for yemenportal/si',$google,$g); $google=$g[1]; $webalizer=shell_exec("links -source 'http://yemenportal.net/webalizer/index.html'"); if (strlen($webalizer)<5000) return; preg_match('/<\/FONT><\/A><\/TD>.+?>(\d+)<.+?>Totals<\/FONT><\/TH>.+?>\d+<.+?>(\d+)<.+?>\d+<.+?>(\d+)<.+?>(\d+)100000) { $handle = fopen("i.txt","w"); fwrite($handle, $out); fclose($handle); } else { $out = shell_exec("links -source 'http://yemenportal.net/index2.php'"); if (strlen($out)>100000) { $handle = fopen("i.txt","w"); fwrite($handle, $out); fclose($handle); } } return $total; } /********************************************************************************************************/ /* FUNCTION TO SEND THE UPDATE BY EMAIL TO WEBMASTER /********************************************************************************************************/ function send_update($b,$s,$n) { if (!$ml) { echo "Successfully updated ($b) with ($n) new articles\n"; return; } $to = 'wsaqaf@gmail.com'; // note the comma $subject = gmdate('Y/m/d')." update: $n new articles inserted to ($b) on Yemen Portal"; // To send HTML mail, the Content-type header must be set $headers = 'MIME-Version: 1.0' . "\r\n"; $headers .= 'Content-type: text/html; charset=utf-8' . "\r\n"; // Additional headers $headers .= 'To: Walid Al-Saqaf ' . "\r\n"; $headers .= 'From: Yemen Portal BOT ' . "\r\n"; mail($to, $subject, $s, $headers); echo "Update ($n new stories) for ($b) emailed at:".gmdate("F j, Y, g:i a")."
\n"; }
/**
 * Convert an Arabic formatted date to the "Y/m/d" form used for SQL.
 *
 * Three shapes are recognised:
 *   - absolute: "<day> <text> (<month name>) <year>", resolved via month_num();
 *   - "N <hour word> ...": N hours before now;
 *   - "N <minute word> ...": N minutes before now.
 * The hour and minute words are read from the 2nd and 3rd line of times.html
 * (the first line is read but not used).
 *
 * @param string $original_t Date text as scraped from the page.
 * @return string|null Converted date, or null when no shape matches.
 */
function convert_time($original_t)
{
    $hour_word = '';
    $minute_word = '';
    $handle = is_readable("times.html") ? fopen("times.html", "r") : false;
    if ($handle) {
        fgets($handle);
        $hour_word = rtrim((string) fgets($handle));
        $minute_word = rtrim((string) fgets($handle));
        fclose($handle);
    }

    // Absolute form. Only trust the capture groups when the pattern matched.
    if (strpos($original_t, "(") !== false
        && preg_match('/(\d+?) .+? \((.+?)\) (\d\d\d\d)/is', $original_t, $dt)) {
        return $dt[3]."/".month_num($dt[2])."/".$dt[1];
    }

    // Relative forms. gmdate() is used for both so that hours and minutes are
    // resolved against the same clock. A missing count is treated as 0.
    if ($hour_word !== '' && strpos($original_t, $hour_word) !== false) {
        $count = preg_match('/.+? (\d+) /is', $original_t, $dt) ? intval($dt[1]) : 0;
        return gmdate('Y/m/d', time() - $count * 60 * 60);
    }
    if ($minute_word !== '' && strpos($original_t, $minute_word) !== false) {
        $count = preg_match('/.+? (\d+) /is', $original_t, $dt) ? intval($dt[1]) : 0;
        return gmdate('Y/m/d', time() - $count * 60);
    }

    return null;
}

/**
 * Replace the first alphabetic word of a date string (taken to be an English
 * month name) with its two-digit month number from month_num2().
 *
 * Every case-insensitive occurrence of that word is replaced. A word that
 * month_num2() does not recognise is removed.
 *
 * @param string $d Date text containing an English month name.
 * @return string The date text with the month name substituted.
 */
function conv_date_eng($d)
{
    // Nothing alphabetic in the input: there is no month name to convert.
    if (!preg_match('/([a-zA-Z]+)/', $d, $dt)) {
        return $d;
    }

    $month_number = (string) month_num2($dt[1]);

    // $dt[1] consists of letters only, so it needs no quoting inside the pattern.
    return preg_replace("/".$dt[1]."/i", $month_number, $d);
}

/**
 * Convert a date written with an Arabic month name, an English month name or
 * digits only into a "year-month-day" string.
 *
 * With a month name, the first four-digit run is the year and the first
 * remaining digit run is the day. Without one, the first three digit runs are
 * used: year-first input keeps its order, year-last input is reversed.
 *
 * @param string $dt Date text as scraped from the page.
 * @return string Converted date; input without three digit runs is returned
 *                as it stands at that point.
 */
function convert_all_dates($dt)
{
    // A run of 4-10 bytes that are not ASCII letters, digits, whitespace or
    // date separators is taken to be an Arabic month name.
    $m = null;
    if (preg_match('/([^a-zA-Z\s0-9\s\-\/\\\.]{4,10})/', $dt, $month) && $month[1]) {
        $m = month_num($month[1]);
    } elseif (preg_match('/([a-zA-Z]{3,10})/', $dt, $month) && $month[1]) {
        $m = month_num2($month[1]);
    }

    if ($m) {
        if (preg_match('/(\d\d\d\d)/', $dt, $y) && $y[1]) {
            // Drop the year so that the next digit run found is the day.
            $dt = str_replace($y[1], "", $dt);
            if (preg_match('/(\d+)/', $dt, $d) && $d[1]) {
                return $y[1]."-".$m."-".$d[1];
            }
        }
    }

    // Digits-only path: normalise to "a-b-c", then decide which end is the year.
    $dt = preg_replace('/.*?(\d+).+?(\d+).+?(\d+).*/', '$1-$2-$3', $dt);

    // preg_split() replaces split(), which no longer exists in PHP 7+.
    // array_pad() keeps list() safe when fewer than three fields are present.
    $parts = array_pad(preg_split('/[\\\\\/.-]/', $dt), 3, '');
    list($first, $second, $third) = $parts;

    if (strlen($first) == 1) {
        $first = '0'.$first;
    }
    if (strlen($second) == 1) {
        $second = '0'.$second;
    }
    if (strlen($third) == 1) {
        $third = '0'.$third;
    }

    // Year first: keep the order and apply the padding. The original tested
    // only the middle field here, so year-first dates were returned unpadded;
    // that test is kept alongside the intended one for compatibility.
    if (strlen($first) > 2 || strlen($second) > 2) {
        $dt = $first."-".$second."-".$third;
    } elseif (strlen($third) > 2) {
        // Year last: reverse into year-month-day.
        $dt = $third."-".$second."-".$first;
    }

    return $dt;
}
/********************************************************************************************************/ /* FUNCTION TO CONVERT ARABIC MONTHS TO NUMBERS /********************************************************************************************************/ function month_num($arabic_month) { // print "month ($arabic_month)
"; $handle = fopen("months.html", "r"); $contents = fread($handle, filesize("months.html")); fclose($handle); $s=preg_match('/

'.$arabic_month.'<\/p>(\d\d)
/is',$contents,$temp); if ($temp[1]) return ($temp[1]); $handle = fopen("months2.html", "r"); $contents = fread($handle, filesize("months.html")); fclose($handle); $s=preg_match('/

'.$arabic_month.'<\/p>(\d\d)
/is',$contents,$temp); return ($temp[1]); }
/**
 * Convert an English formatted date to the "Y/m/d" form used for SQL.
 *
 * Three shapes are recognised:
 *   - absolute: "<month name> <day>, <year>", resolved via month_num2();
 *   - text containing "hour": N hours before now;
 *   - text containing "minute": N minutes before now.
 *
 * @param string $original_t Date text as scraped from the page.
 * @return string|null Converted date, or null when no shape matches.
 */
function convert_time_eng($original_t)
{
    // Absolute form. Only read the capture groups when the pattern matched.
    if (preg_match('/\s*(.+?) (\d+?)\, (\d\d\d\d)/is', $original_t, $dt)
        && $dt[1] && $dt[2] && $dt[3]) {
        return $dt[3]."/".month_num2($dt[1])."/".$dt[2];
    }

    // Relative forms. gmdate() is used for both so that hours and minutes are
    // resolved against the same clock. A missing count is treated as 0.
    if (strpos($original_t, "hour") !== false) {
        $count = preg_match('/\s*(\d+) hour/is', $original_t, $dt) ? intval($dt[1]) : 0;
        return gmdate('Y/m/d', time() - $count * 60 * 60);
    }
    if (strpos($original_t, "minute") !== false) {
        $count = preg_match('/\s*(\d+) minute/is', $original_t, $dt) ? intval($dt[1]) : 0;
        return gmdate('Y/m/d', time() - $count * 60);
    }

    return null;
}

/**
 * Convert an English month name to its two-digit month number.
 *
 * Only the first three letters are compared, case-insensitively, so both
 * "Sep" and "September" give "09".
 *
 * @param string $english_month Month name or abbreviation.
 * @return string|null "01".."12", or null for an unrecognised name.
 */
function month_num2($english_month)
{
    static $numbers = array(
        'jan' => '01', 'feb' => '02', 'mar' => '03', 'apr' => '04',
        'may' => '05', 'jun' => '06', 'jul' => '07', 'aug' => '08',
        'sep' => '09', 'oct' => '10', 'nov' => '11', 'dec' => '12',
    );

    $key = substr(strtolower((string) $english_month), 0, 3);

    return isset($numbers[$key]) ? $numbers[$key] : null;
}
/********************************************************************************************************/ /* FUNCTION TO CREATE AND STORE THUMBNAILS AND RETURN REFERENCE FOR DB /********************************************************************************************************/ function store_img($b,$img,$w) { if (!remote_file_exists($img)) return (""); $bin_img=shell_exec('links -source \''.addslashes($img).'\''); //echo "[".'links -source \''.$img.'\''."]\n"; $ps=strpos($bin_img,""); if (!$ps) $ps=strpos($bin_img,""); if ($ps===true) return (""); $handle = fopen("images/temp.pic","w"); fwrite($handle, $bin_img); fclose($handle); $loc=strrpos($img,"/"); if ($loc) $l=substr($img,$loc+1,strlen($img)-1); if (!$l) return (""); $l=preg_replace("/[^a-zA-Z0-9\_\-\.]/", "", $l); $x=@getimagesize("images/temp.pic"); $fl=file_exists('images/'.$b.'_'.$l); if (($x[0]>$x[1] && $x[0]<=100)||($x[1]>=$x[0] && $x[1]<=100)) shell_exec('cp -f images/temp.pic \'images/'.$b.'_'.$l.'\''); else { if ($x[0]>$x[1] && !$fl) $res=shell_exec('/usr/local/bin/convert images/temp.pic -thumbnail '.$w.'x -bordercolor black -border 1 \'images/'.$b.'_'.$l.'\''); elseif ($x[0]<=$x[1] && !$fl) $res=shell_exec('/usr/local/bin/convert images/temp.pic -thumbnail x'.$w.' 
-bordercolor black -border 1 \'images/'.$b.'_'.$l.'\''); } shell_exec('rm -f images/temp.pic'); if (remote_file_exists("http://yemenportal.net/images/".$b."_".$l)) return ("http://yemenportal.net/images/".$b."_".$l); return (""); } /********************************************************************************************************/ /* FUNCTION TO CHECK IF A SPECIFIC SOURCE EXISTS IN DB OR NOT /********************************************************************************************************/ function news_source_exists($l,$list) { //echo "
l0:$l
"; $l=str_replace('://','',$l); $l=substr($l,0,strpos($l,'/')); $l=str_replace('-','',$l); $ks=array_keys($list); //echo "
l1:$l
"; for ($k=0; $k\n"; preg_match('/'.$ks[$k].'\./i',$l,$res); if ($res[0]) { //echo "
found ${ks[$k]} in $l
"; return (true); } // else echo "${ks[$k]} is not in $l
\n"; } return (false); }
/**
 * Check whether a remote HTTP resource exists by sending it a HEAD request.
 *
 * 301/302 responses are followed through their Location header, for at most
 * $max_redirects hops, so a redirect cycle cannot recurse without end.
 *
 * NOTE: failures are reported as non-empty message strings, which are truthy.
 * A caller that needs a strict yes/no answer must compare with === true.
 *
 * @param string $url           Absolute http:// URL to probe.
 * @param int    $max_redirects Maximum number of redirects to follow.
 * @return bool|string true for a 200 response. false for 404/410, for a
 *                     redirect without a Location header, or when the redirect
 *                     limit is used up. Otherwise one of the messages
 *                     "1 Invalid URL host", "2 Unable to connect to remote
 *                     host" or "3 Status Code not supported[: <status>]".
 */
function remote_file_exists($url, $max_redirects = 5)
{
    $url_p = parse_url($url);
    if (!isset($url_p['host'])) {
        return '1 Invalid URL host';
    }
    $host = $url_p['host'];

    // Honour an explicit port in the URL instead of always connecting to 80.
    $port = isset($url_p['port']) ? $url_p['port'] : 80;

    $fp = fsockopen($host, $port, $errno, $errstr, 20);
    if (!$fp) {
        return '2 Unable to connect to remote host';
    }

    // Bound the reads as well: on a stalled connection feof() never turns
    // true, so without this the loop below could spin forever.
    stream_set_timeout($fp, 20);

    fputs($fp, 'HEAD '.$url." HTTP/1.1\r\n");
    fputs($fp, 'HOST: '.$host."\r\n");
    fputs($fp, "Connection: close\r\n\r\n");

    $headers = '';
    while (!feof($fp)) {
        $chunk = fgets($fp, 128);
        if ($chunk === false) {
            break;
        }
        $headers .= $chunk;
    }
    fclose($fp);

    // Read the numeric status from the status line rather than searching the
    // whole line for a substring.
    $arr_headers = explode("\n", $headers);
    $code = '';
    if (isset($arr_headers[0]) && preg_match('/^HTTP\/\S+\s+(\d\d\d)/i', $arr_headers[0], $status)) {
        $code = $status[1];
    }

    if ($code === '200') {
        return true;
    }
    if ($code === '404' || $code === '410') {
        return false;
    }
    if ($code === '301' || $code === '302') {
        if ($max_redirects <= 0) {
            return false;
        }
        preg_match("/Location:\s*(.+)\r/i", $headers, $matches);
        if (!isset($matches[1])) {
            return false;
        }
        $nextloc = trim($matches[1]);

        // A host-relative Location has no host of its own; resolve it against
        // the host just contacted so it is not rejected as an invalid URL.
        if (strpos($nextloc, '/') === 0) {
            $nextloc = 'http://'.$host.($port != 80 ? ':'.$port : '').$nextloc;
        }

        return remote_file_exists($nextloc, $max_redirects - 1);
    }

    preg_match('/HTTP.*(\d\d\d.*)\r/i', $headers, $matches);
    return '3 Status Code not supported'.
        (isset($matches[1]) ? ": $matches[1]" : '');
}
/********************************************************************************************************/ /* END OF ALL FUNCTIONS AND BODY OF PHP SCRIPT /********************************************************************************************************/ /* mysql_connect("mysql7.websitesource.net","investo_yportal","yemenportal") or die( "no connection"); mysql_select_db("investo_yp") or die ("no select"); $arr=array_keys($crawler_sources); foreach ($arr as $s) { $result=mysql_query("SELECT SUM(views) FROM ALL_STORIES where source='$s'"); $result=mysql_fetch_row($result); $sums[$s]=$result[0]; mysql_query("UPDATE SOURCES SET views=views+${result[0]} WHERE source='$s'"); echo "source:$s total views:${result[0]}
"; } foreach ($arr as $s) { $result=mysql_query("SELECT SUM(comment_no) FROM ALL_STORIES where source='$s'"); $result=mysql_fetch_row($result); $sums[$s]=$result[0]; mysql_query("UPDATE SOURCES SET comment_no=comment_no+${result[0]} WHERE source='$s'"); echo "source:$s total comment_no:${result[0]}
"; } foreach ($arr as $s) { $result=mysql_query("SELECT SUM(pic) FROM ALL_STORIES where source='$s'"); $result=mysql_fetch_row($result); $sums[$s]=$result[0]; mysql_query("UPDATE SOURCES SET pic=pic+${result[0]} WHERE source='$s'"); echo "source:$s total pics:${result[0]}
"; } */ ?>