xref: /web-master/scripts/rss_parser (revision f17c55f1)
1<?php
2
3// This file contains code to scrape the PHP.net homepage and
4// generate RSS news feed information out of that (included
5// in the update-backend PHP-CLI script)
6
7// Thanks to Adolfo Garcia Veytia for contributing the
8// original version of this code to php.net
9
// Return the whole contents of a file as a string.
//
// $fname: path of the file to read.
//
// Returns the contents ("" for an empty file), or false when the file
// is not readable or the read itself fails.
function getData($fname) {
    if (!is_readable($fname)) { return false; }

    // file_get_contents() reads until EOF, so the result does not depend
    // on filesize(): that value is 0 for an empty file (a length which
    // fread() rejects) and may come from a stale stat cache entry.
    return file_get_contents($fname);
}
19
// Try to find a link in the text with any of the linking methods used
// on the page and return it as a full URL.
//
// $text:     raw source text of one news item.
// $rootLink: base URL that relative links get resolved against.
//
// An <a name="..."> anchor wins over everything else and becomes a
// "#fragment" link; otherwise the first matching print_link() call or
// plain <a href="..."> is used. When nothing matches, the result is the
// root link followed by a slash.
function scanLinks($text, $rootLink) {

    $text = (string) $text;

    // Start from an empty link so that text without any link is well
    // defined instead of reading an unset variable
    $link = '';

    if (preg_match('/<a\s+name="([^"]*)">/', $text, $matches)) {
        $link = "#" . $matches[1];
    } else {
        // Remaining link styles in priority order, the URL is always
        // the first capture group
        $patterns = [
            '/<\?php\s+print_link\s*\("([^"]+)",\s*"[^"]+"\);\s*\?>/',
            '/<\?php\s+print_link\s*\(\'([^\']+)\',\s*\'[^\']+\'\);\s*\?>/',
            '/<\?php\s+print_link\s*\("([^"]+)",\s*make_image\s*\([^\)]*\)\s*\);\s*\?>/',
            '/<a\s+href="([^"]*)">/',
        ];
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $text, $matches)) {
                $link = $matches[1];
                break;
            }
        }
    }

    // Make sure it is a full URL (https links are absolute too and must
    // not get the root link prepended)
    if (!preg_match('/^https?:/i', $link)) {
        $rootLink = rtrim($rootLink, "/");
        // substr() is safe on an empty link, unlike $link[0]
        $link = (substr($link, 0, 1) === '/') ? "$rootLink$link" : "$rootLink/$link";
    }

    return $link;
}
43
// Keep the parts of a news item's text that are needed in the feed and
// drop everything unusable: embedded PHP helper calls are reduced to
// their visible text, HTML is stripped and whitespace is normalized.
function ProcessText($text) {

    // Rewrites are applied in this order, each one on the result of the
    // previous one:
    //  1. make_image() calls vanish (images cannot travel through XML)
    //  2. print_link("url", "label") collapses to its label
    //  3. print_link('url', 'label') collapses to its label
    //  4. print_link("url", make_image(...)) vanishes completely
    // (To keep real hyperlinks instead, rules 2 and 3 would have to
    // produce <a href="$1">$2</a> rather than the bare label.)
    $patterns = [
        '/<\?php\s+echo\s+make_image\s*\("([^"]*)",\s*"([^"]*)",\s+"([^"]*)"\);\s*\?>/i',
        '/<\?php\s+print_link\s*\("([^"]+)",\s*"([^"]+)"\);\s*\?>/',
        '/<\?php\s+print_link\s*\(\'([^\']+)\',\s*\'([^\']+)\'\);\s*\?>/',
        '/<\?php\s+print_link\s*\("([^"]+)",\s*make_image\s*\([^\)]*\)\s*\);\s*\?>/',
    ];
    $replacements = [" ", "$2", "$2", ""];

    $withoutHelpers = preg_replace($patterns, $replacements, $text);

    // Drop HTML and surrounding whitespace, then squeeze every run of
    // whitespace into a single space
    $plain = trim(strip_tags($withoutHelpers));
    return preg_replace("!\\s+!", " ", $plain);
}
66
// Parse the index file source searching for news item information.
//
// $index_page: full source of the index page as one string.
// $aboutLink:  base URL used to turn relative item links into full URLs.
//
// Returns an array of news items (empty when none are found). Every
// item has the keys 'title', 'text', 'link' and 'date' ("Y-m-d", or an
// empty string when no parsable date line was seen) and optionally
// 'subject'.
//
// Both parameters have defaults because an optional parameter in front
// of a required one is deprecated; existing two-argument calls are
// unaffected.
function ParseNews($index_page = "", $aboutLink = "") {

    // Split the file by newlines
    $lines = preg_split("/\n/", (string) $index_page);

    // Define month conversion hash
    $mos = [
        "Jan" => 1,  "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6,
        "Jul" => 7,  "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12
    ];

    // We have not started to parse the news and we have no headlines
    // yet. The list starts out as a real array, so a page without news
    // gives an empty result and array_pop() below never sees null.
    $headlines = [];
    $news_started = false;
    $headlineid = 0;

    // Try to classify every line backed with state information
    // and patterns to recognize for news item elements
    foreach ($lines as $line) {

        // Everything up to and including the marker comment is skipped.
        // The comparison is strict because strpos() returns 0 (falsy)
        // when the marker sits at the very start of the line.
        if (!$news_started) {
            if (strpos($line, "DO NOT REMOVE THIS COMMENT") !== false) { $news_started = true; }
            continue;
        }

        // Headline separator reached
        if (preg_match('!<hr />!', $line)) {

            $headlineid++;

        // End of headlines reached
        } elseif (preg_match('@<a href="/archive/index.php">News Archive</a>@', $line) || strpos($line, "// NO MORE NEWS TO PARSE") === 0) {

            // Whatever was collected after the last separator is not a
            // news item, so the last entry is thrown away
            array_pop($headlines);
            break;

        // The headline title is in <h1> tags [it needs to be on one line!]
        } elseif (preg_match('/<h1>(.*)<\/h1>/i', $line, $matches)) {

            $headlines[$headlineid]['title'] = $matches[1];

        // Dates are below the headline title, formatted as [DD-Mon-YYYY]
        } elseif (preg_match('/<span.*>\[(\d+)-(\S*)-(\d+)\]<\/span>/', $line, $matches)) {

            // An unknown month name leaves the item without a date
            // instead of feeding an undefined value to mktime()
            if (isset($mos[$matches[2]])) {
                $headlines[$headlineid]['date'] = mktime(1, 1, 1, $mos[$matches[2]], (int) $matches[1], (int) $matches[3]);
            }

        // Subjects (i.e. RDF category)
        } elseif (preg_match("/<!-- SUBJECT: (.*?) -->/", $line, $matches)) {

            $headlines[$headlineid]['subject'] = $matches[1];

        // Everything else is part of the headline text, except for
        // lines consisting only of a closing PHP tag
        } elseif (!preg_match('/^\s*\?>\s*$/', $line)) {

            $sofar = isset($headlines[$headlineid]['text']) ? $headlines[$headlineid]['text'] : '';
            $headlines[$headlineid]['text'] = $sofar . " $line";
        }
    }

    // Cycle through the headlines and complete every item
    foreach ($headlines as $num => $headline) {

        // An item may consist of a title or date only
        $text = isset($headline['text']) ? $headline['text'] : '';

        // The first link found is THE link for the news item
        $headlines[$num]['link'] = scanLinks($text, $aboutLink);

        // And the text needs to be cleaned up
        $headlines[$num]['text'] = ProcessText($text);

        // And date needs to be reformatted (if there is one)
        $headlines[$num]['date'] = isset($headline['date']) ? date("Y-m-d", $headline['date']) : '';

        // Consumers read the title unconditionally
        if (!isset($headline['title'])) {
            $headlines[$num]['title'] = '';
        }
    }

    return $headlines;
}
158
// Generate the RSS (RDF) header and append it to $RSS.
//
// $headlines: list of news items; only the 'link' of every item is used
//             here. Anything that is not an array is cast to one, so
//             null yields a channel without resources.
// $RSS:       feed string under construction, modified in place.
// $aboutLink: URL of the channel.
//
// URLs are escaped for XML, so a query string containing "&" or a stray
// quote cannot break the document. Entities that are already present
// (such as "&amp;") are left alone and not encoded a second time.
function GenerateRSSHeader($headlines, &$RSS, $aboutLink) {
    $about = htmlspecialchars((string) $aboutLink, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);

    $RSS .= "<" . "?xml version=\"1.0\" encoding=\"utf-8\"?>\n" .
            "<rdf:RDF\n" .
            "\txmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n" .
            "\txmlns=\"http://purl.org/rss/1.0/\"\n" .
            "\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n" .
            ">\n" .
            "<channel rdf:about=\"$about\">\n" .
            "\t<title>PHP: Hypertext Preprocessor</title>\n" .
            "\t<link>$about</link>\n" .
            "\t<description>The PHP scripting language web site</description>\n" .
            "\t<items>\n" .
            "\t\t<rdf:Seq>\n";

    // Cycle through all the Resources on the RSS
    foreach ((array)$headlines as $headline) {
        $resource = isset($headline['link']) ? (string) $headline['link'] : '';
        $resource = htmlspecialchars($resource, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
        $RSS .= "\t\t\t<rdf:li rdf:resource=\"" . $resource . "\" />\n";
    }

    $RSS .= "\t\t</rdf:Seq>\n\t</items>\n</channel>\n";
}
181
// Close the feed: append the closing tag of the RDF root element to
// $RSS (modified in place).
function GenerateRSSFooter(&$RSS) {
    $closingTag = "</rdf:RDF>\n";
    $RSS = $RSS . $closingTag;
}
186
// Add one RSS item's information to $RSS.
//
// $href:    URL of the item, used for rdf:about and <link>.
// $title:   item title.
// $text:    item description.
// $date:    item date, written into <dc:date> as given.
// $subject: optional category; any falsy value omits <dc:subject>.
// $RSS:     feed string under construction, modified in place.
//
// The URL is escaped for XML, so a query string containing "&" or a
// stray quote cannot break the document; entities already present in it
// are not encoded twice. Title, text, subject and date are written
// exactly as passed in, since they may already carry entities or markup.
function GenerateRSSItem($href, $title, $text, $date, $subject, &$RSS) {
    $url = htmlspecialchars((string) $href, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);

    if ($subject) {
        $s = "\t<dc:subject>$subject</dc:subject>\n";
    } else {
        $s = "";
    }

    $RSS .= "\n<item rdf:about=\"$url\">\n" .
            "\t<title>$title</title>\n" .
            "\t<link>$url</link>\n" .
            $s .
            "\t<description>$text</description>\n" .
            "\t<dc:date>" . $date . "</dc:date>\n" .
            "</item>\n";
}
202
// Build the complete RSS feed for the index page below $root.
//
// $root:      directory containing the index.php to scrape.
// $aboutLink: URL of the channel, also the base for relative item links.
//
// Returns the feed as a string. When the index page cannot be read or
// holds no news items, the feed is still complete but has no items.
function GenerateRSSFile($root, $aboutLink) {

    // Get the index page's source code; an unreadable page is treated
    // like an empty one
    $homepage = getData("$root/index.php");
    if ($homepage === false) {
        $homepage = '';
    }

    // This returns a data structure containing all the news items found.
    // The cast keeps the loop below safe even if the parser hands back
    // something that is not an array.
    $hlines = (array) ParseNews($homepage, $aboutLink);

    // Start with an empty RSS string
    $RSS = '';

    // Generate the RSS Header
    GenerateRSSHeader($hlines, $RSS, $aboutLink);

    // Add separator comment
    $RSS .= "<!-- RSS-Items -->\n";

    // Add every news item to the feed; parts an item does not have are
    // passed as empty strings (false for the optional subject)
    foreach ($hlines as $hline) {
        GenerateRSSItem(
            isset($hline['link']) ? $hline['link'] : '',
            isset($hline['title']) ? $hline['title'] : '',
            isset($hline['text']) ? $hline['text'] : '',
            isset($hline['date']) ? $hline['date'] : '',
            isset($hline['subject']) ? $hline['subject'] : false,
            $RSS
        );
    }

    // Add end separator
    $RSS .= "<!-- / RSS-Items PHP/RSS -->\n";

    // Dump the last XML tag
    GenerateRSSFooter($RSS);

    return $RSS;
}
233
234#$RSSNews = GenerateRSSFile($root, "http://php.net/");
235#$RSSConf = GenerateRSSFile("$root/conferences", "http://php.net/conferences/");
236
237?>
238