<?php

// This file contains code to scrape the PHP.net homepage and
// generate RSS news feed information out of that (included
// in the update-backend PHP-CLI script)

// Thanks to Adolfo Garcia Veytia for contributing the
// original version of this code to php.net

/**
 * Return the whole file in a string.
 *
 * @param string $fname Path of the file to read.
 * @return string|false The file contents ("" for an empty file), or false
 *                      when $fname is not a readable regular file or the
 *                      read fails.
 */
function getData($fname) {
    if (!is_file($fname) || !is_readable($fname)) { return false; }

    // file_get_contents() copes with zero-length files, whereas
    // fread($fp, filesize($fname)) rejects a length of 0
    return file_get_contents($fname);
}

/**
 * Try to find a link in the text with any of the used linking methods.
 *
 * The patterns are tried in a fixed priority order (named anchor, the
 * three print_link() forms, plain href); the first pattern that matches
 * anywhere in the text supplies the link.
 *
 * @param string $text     Raw news item source.
 * @param string $rootLink Base URL used to complete relative links.
 * @return string An absolute URL. When no link is found (or the link is
 *                empty) this is $rootLink with exactly one trailing slash.
 */
function scanLinks($text, $rootLink) {

    // [pattern, prefix put in front of the captured target]
    $candidates = [
        ['/<a\s+name="([^"]*)">/', "#"],
        ['/<\?php\s+print_link\s*\("([^"]+)",\s*"[^"]+"\);\s*\?>/', ""],
        ['/<\?php\s+print_link\s*\(\'([^\']+)\',\s*\'[^\']+\'\);\s*\?>/', ""],
        ['/<\?php\s+print_link\s*\("([^"]+)",\s*make_image\s*\([^\)]*\)\s*\);\s*\?>/', ""],
        ['/<a\s+href="([^"]*)">/', ""],
    ];

    $text = (string) $text;
    $link = "";
    foreach ($candidates as $candidate) {
        if (preg_match($candidate[0], $text, $matches)) {
            $link = $candidate[1] . $matches[1];
            break;
        }
    }

    // Already a full URL (http or https): nothing to complete
    if (preg_match('/^https?:/i', $link)) {
        return $link;
    }

    // Make sure it is a full URL
    $rootLink = rtrim($rootLink, "/");
    if ($link === "") {
        return "$rootLink/";
    }
    return ($link[0] != '/') ? "$rootLink/$link" : "$rootLink$link";
}

/**
 * Preserve the parts of the text that are needed and drop everything unusable.
 *
 * @param string $text Raw news item source.
 * @return string Plain text: make_image()/print_link() snippets resolved,
 *                HTML tags stripped, whitespace runs collapsed to one space.
 */
function ProcessText($text) {

    // Delete images, since this data will go through XML
    $text = preg_replace('/<\?php\s+echo\s+make_image\s*\("([^"]*)",\s*"([^"]*)",\s+"([^"]*)"\);\s*\?>/i', " ", (string) $text);

    // Replace the links with <a> links (use this or the one below)
    /*
    $text = preg_replace('/<\?php\s+print_link\s*\("([^"]+)",\s*"([^"]+)"\);\s*\?>/', "<a href=\"$1\">$2</a>", $text); // <?php
    $text = preg_replace('/<\?php\s+print_link\s*\(\'([^\']+)\',\s*\'([^\']+)\'\);\s*\?>/', "<a href=\"$1\">$2</a>", $text);
    $text = preg_replace('/<\?php\s+print_link\s*\("([^"]+)",\s*make_image\s*\([^\)]*\)\s*\);\s*\?>/', "", $text);
    */

    // Remove the hyperlink references (use this or the one above):
    // text links keep their label, image links vanish completely
    $text = preg_replace('/<\?php\s+print_link\s*\("([^"]+)",\s*"([^"]+)"\);\s*\?>/', '$2', $text);
    $text = preg_replace('/<\?php\s+print_link\s*\(\'([^\']+)\',\s*\'([^\']+)\'\);\s*\?>/', '$2', $text);
    $text = preg_replace('/<\?php\s+print_link\s*\("([^"]+)",\s*make_image\s*\([^\)]*\)\s*\);\s*\?>/', "", $text);

    // Drop HTML, trim string and drop multiple spaces
    $text = trim(strip_tags($text));
    return preg_replace("!\\s+!", " ", $text);
}

/**
 * Parse the index file searching for news item information.
 *
 * Parsing is line based and starts after the first line containing
 * "DO NOT REMOVE THIS COMMENT". Every "<hr />" line starts a new item.
 * Parsing stops at the "News Archive" link or at a line starting with
 * "// NO MORE NEWS TO PARSE"; at that point the item being collected is
 * discarded (presumably it is the non-news content that follows the last
 * separator -- confirm against the page layout).
 *
 * Both parameters are required. (The former default on $index_page could
 * never be used, because a required parameter follows it.)
 *
 * @param string $index_page Source code of the index page.
 * @param string $aboutLink  Base URL used to complete relative links.
 * @return array List of items, each with the keys 'title', 'text', 'link',
 *               'date' (formatted Y-m-d) and optionally 'subject'. Empty
 *               when no news was found. Items without a parsable date get
 *               the current date; items without a title get "".
 */
function ParseNews($index_page, $aboutLink) {

    // Remove commented items
    //$index_page = preg_replace("/<!--[\w\W]*?-->/", "", $index_page);

    // Split the file by newlines
    $lines = preg_split("/\n/", (string) $index_page);
    #DEBUG# print "<pre>"; print_r($lines); print "</pre>";

    // Define month conversion hash
    $mos = [
        "Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6,
        "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12
    ];

    // We have not started to parse the
    // news and we have no headlines right here
    $headlines = [];
    $news_started = false;
    $headlineid = 0;

    // Try to classify every line backed with state information
    // and patterns to recognize for news item elements
    foreach ($lines as $line) {

        // We are not in a news item
        if (!$news_started) {

            // If we found this comment, then we are at the right place.
            // Compare against false so a marker at column 0 counts too;
            // the marker line itself is never parsed as news content.
            if (strpos($line, "DO NOT REMOVE THIS COMMENT") !== false) { $news_started = true; }
            continue;
        }

        // We are in a news item

        // Headline separator reached
        if (preg_match('!<hr />!', $line)) {

            $headlineid++;
            #DEBUG# print "<p>Info: New Headline: $headlineid<br />";

        // End of headlines reached
        } elseif (preg_match('@<a href="/archive/index.php">News Archive</a>@', $line) || strpos($line, "// NO MORE NEWS TO PARSE") === 0) {

            // Drop the item that was being collected when the end was hit
            array_pop($headlines);
            break;

        // The headline title is in <h1> tags [it needs to be on one line!]
        } elseif (preg_match('/<h1>(.*)<\/h1>/i', $line, $matches)) {

            $headlines[$headlineid]['title'] = $matches[1];
            #DEBUG# print "Title: $matches[1]<br />";

        // Dates are below the headline title, formatted [day-Mon-year]
        } elseif (preg_match('/<span.*>\[(\d+)-(\S*)-(\d+)\]<\/span>/', $line, $matches)) {

            // Unknown month abbreviations leave the date unset
            if (isset($mos[$matches[2]])) {
                $headlines[$headlineid]['date'] = mktime(1, 1, 1, $mos[$matches[2]], (int) $matches[1], (int) $matches[3]);
            }
            #DEBUG# print "Date: $matches[1] $matches[2] $matches[3]<br />";

        // Subjects (i.e RDF category)
        } elseif (preg_match("/<!-- SUBJECT: (.*?) -->/", $line, $matches)) {

            $headlines[$headlineid]['subject'] = $matches[1];

        // Everything else is part of the headline text, except
        // lines that hold nothing but a closing PHP tag
        } elseif (!preg_match('/^\s*\?>\s*$/', $line)) {

            if (isset($headlines[$headlineid]['text'])) {
                $headlines[$headlineid]['text'] .= " $line";
            } else {
                $headlines[$headlineid]['text'] = " $line";
            }
        }
    }

    // Cycle through the headlines
    foreach ($headlines as $num => $headline) {

        $rawText = isset($headline['text']) ? $headline['text'] : '';

        // The first link found is THE link for the news item
        $headlines[$num]['link'] = scanLinks($rawText, $aboutLink);

        // And the text needs to be cleaned up
        $headlines[$num]['text'] = ProcessText($rawText);

        // And date needs to be reformatted (current date if none was found)
        $headlines[$num]['date'] = date("Y-m-d", isset($headline['date']) ? $headline['date'] : time());

        // Consumers read 'title' unconditionally
        if (!isset($headline['title'])) {
            $headlines[$num]['title'] = '';
        }
    }

    return $headlines;
}

/**
 * Generate RSS header text and inject it into $RSS.
 *
 * @param array|null $headlines Items as returned by ParseNews(); only the
 *                              'link' of each item is used here.
 * @param string     $RSS       Feed text, appended to by reference.
 * @param string     $aboutLink URL of the channel.
 * @return void
 */
function GenerateRSSHeader($headlines, &$RSS, $aboutLink) {
    // "<" and "?xml" are kept apart so the opening of the XML
    // declaration never appears literally in this source file
    $RSS .= "<" . "?xml version=\"1.0\" encoding=\"utf-8\"?>\n" .
            "<rdf:RDF\n" .
            "\txmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n" .
            "\txmlns=\"http://purl.org/rss/1.0/\"\n" .
            "\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n" .
            ">\n" .
            "<channel rdf:about=\"$aboutLink\">\n" .
            "\t<title>PHP: Hypertext Preprocessor</title>\n" .
            "\t<link>$aboutLink</link>\n" .
            "\t<description>The PHP scripting language web site</description>\n" .
            "\t<items>\n" .
            "\t\t<rdf:Seq>\n";

    // Cycle through all the Resources on the RSS
    foreach ((array) $headlines as $headline) {
        $RSS .= "\t\t\t<rdf:li rdf:resource=\"" . $headline['link'] . "\" />\n";
    }

    $RSS .= "\t\t</rdf:Seq>\n\t</items>\n</channel>\n";
}

/**
 * Add RSS footer information to $RSS.
 *
 * @param string $RSS Feed text, appended to by reference.
 * @return void
 */
function GenerateRSSFooter(&$RSS) {
    $RSS .= "</rdf:RDF>\n";
}

/**
 * Add an RSS item's information to $RSS.
 *
 * NOTE(review): the values are inserted verbatim, without XML escaping.
 * They come from the scraped page and may already contain entities, so
 * escaping here could double-encode them -- confirm before changing.
 *
 * @param string       $href    Item URL (used for rdf:about and <link>).
 * @param string       $title   Item title.
 * @param string       $text    Item description.
 * @param string       $date    Item date, already formatted.
 * @param string|false $subject Item subject; a falsy value omits <dc:subject>.
 * @param string       $RSS     Feed text, appended to by reference.
 * @return void
 */
function GenerateRSSItem($href, $title, $text, $date, $subject, &$RSS) {
    $s = $subject ? "\t<dc:subject>$subject</dc:subject>\n" : "";

    $RSS .= "\n<item rdf:about=\"$href\">\n" .
            "\t<title>$title</title>\n" .
            "\t<link>$href</link>\n" .
            $s .
            "\t<description>$text</description>\n" .
            "\t<dc:date>" . $date . "</dc:date>\n" .
            "</item>\n";
}

/**
 * Build the complete RSS document for the index page found under $root.
 *
 * @param string $root      Directory containing index.php.
 * @param string $aboutLink URL of the channel, also the base for item links.
 * @return string The RSS document. When index.php cannot be read, or holds
 *                no news, the feed simply has no items.
 */
function GenerateRSSFile($root, $aboutLink) {
    // Get the PHP.net index page's source code
    $homepage = getData("$root/index.php");
    if ($homepage === false) {
        $homepage = '';
    }

    // This returns a data structure containing all the news items found
    $hlines = ParseNews($homepage, $aboutLink);

    // Start with an empty RSS string
    $RSS = '';

    // Generate the RSS Header
    GenerateRSSHeader($hlines, $RSS, $aboutLink);

    // Add separator comment
    $RSS .= "<!-- RSS-Items -->\n";

    // Add every news item to the feed
    foreach ($hlines as $hline) {
        GenerateRSSItem(
            $hline['link'],
            $hline['title'],
            $hline['text'],
            $hline['date'],
            isset($hline['subject']) ? $hline['subject'] : false,
            $RSS
        );
    }

    // Add end separator
    $RSS .= "<!-- / RSS-Items PHP/RSS -->\n";

    // Dump the last XML tag
    GenerateRSSFooter($RSS);

    #DEBUG# echo $RSS;
    return $RSS;
}

#$RSSNews = GenerateRSSFile($root, "http://php.net/");
#$RSSConf = GenerateRSSFile("$root/conferences", "http://php.net/conferences/");