<?php
/*
 * urlgrab5.php
 *
 * A simple command-line utility to extract all of the URLs contained
 * within <A HREF> tags from a document.
 *
 * NOTE: Only works with tidy for PHP 5; please see urlgrab.php for tidy for PHP 4.3.x
 *
 * By: John Coggeshall <john@php.net>
 *
 * Usage: php urlgrab5.php <file>
 *
 */

/**
 * Recursively collects the href value of every <a> element found in the
 * subtree rooted at $node.
 *
 * The node itself is inspected before its children, and children are visited
 * in the order tidy exposes them, so results are in pre-order.
 *
 * @param tidyNode   $node Root of the subtree to scan.
 * @param array|null $urls Accumulator, passed by reference. Any value that is
 *                         not an array is replaced by an empty array before
 *                         collection starts; an existing array is appended to.
 *
 * @return array The accumulated href values (same contents as $urls).
 */
function dump_nodes(tidyNode $node, &$urls = NULL) {

    if (!is_array($urls)) {
        $urls = array();
    }

    // Anchors without an href (e.g. <a name="top">) are skipped: reading the
    // missing key would raise a notice and push a useless NULL into the list.
    // isset() also covers the case where the node has no attribute array.
    if (isset($node->id)
        && $node->id === TIDY_TAG_A
        && isset($node->attribute['href'])) {
        $urls[] = $node->attribute['href'];
    }

    if ($node->hasChildren()) {
        foreach ($node->child as $c) {
            // $urls is shared by reference, so the return value is not needed.
            dump_nodes($c, $urls);
        }
    }

    return $urls;
}

// The input file is mandatory; fail with the usage line instead of handing
// an undefined value to tidy.
if (!isset($_SERVER['argv'][1])) {
    fwrite(STDERR, "Usage: php urlgrab5.php <file>\n");
    exit(1);
}

$a = tidy_parse_file($_SERVER['argv'][1]);

// tidy_parse_file() yields false when the document cannot be loaded; calling
// a method on that value would be a fatal error.
if (!$a) {
    fwrite(STDERR, "Unable to parse file: " . $_SERVER['argv'][1] . "\n");
    exit(1);
}

$a->cleanRepair();

// Guard against a missing <html> node so dump_nodes() never receives a
// non-tidyNode argument.
$html = $a->html();
print_r(($html instanceof tidyNode) ? dump_nodes($html) : array());