Scraper scripts often need to extract all the links on a given page. This can be done in several ways, for example with regular expressions or with PHP's DOMDocument class.
Here is a simple code snippet that does it using DOMDocument.
<?php
/* Function to get all links on a given URL using DOMDocument */
function get_links( $link )
{
    // return array
    $ret = array();

    /*** a new DOM object ***/
    $dom = new DOMDocument();

    /*** ignore insignificant white space (must be set before loading) ***/
    $dom->preserveWhiteSpace = false;

    /*** get the HTML (suppress parser warnings for malformed markup) ***/
    @ $dom->loadHTML( file_get_contents( $link ) );

    /*** get the links from the HTML ***/
    $links = $dom->getElementsByTagName( 'a' );

    /*** loop over the links: key is the href, value is the link text ***/
    foreach ( $links as $tag )
    {
        $ret[ $tag->getAttribute( 'href' ) ] = trim( $tag->nodeValue );
    }

    return $ret;
}

// Link to open and search for links (placeholder URL, replace with your own)
$link = 'http://www.example.com/';

/*** get the links ***/
$urls = get_links( $link );

/*** check for results ***/
if ( count( $urls ) > 0 )
{
    foreach ( $urls as $key => $value )
    {
        echo $key . ' - ' . $value . '<br>';
    }
}
else
{
    echo "No links found at $link";
}
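As mentioned above, DOMDocument is only one of several ways to do this. A minimal alternative sketch using DOMXPath is shown below; it selects only anchors that actually carry an href attribute, which avoids picking up empty anchors. The function name get_links_xpath and the placeholder URL are illustrative, not part of the original snippet.

<?php
/* Alternative sketch: extract links with DOMXPath.
   $link is assumed to hold the URL of the page to scrape. */
function get_links_xpath( $link )
{
    $dom = new DOMDocument();

    /*** load the HTML, suppressing parser warnings ***/
    @ $dom->loadHTML( file_get_contents( $link ) );

    $xpath = new DOMXPath( $dom );
    $ret = array();

    /*** query only <a> elements that have an href attribute ***/
    foreach ( $xpath->query( '//a[@href]' ) as $tag )
    {
        $ret[ $tag->getAttribute( 'href' ) ] = trim( $tag->textContent );
    }

    return $ret;
}

/*** usage is the same as above ***/
$urls = get_links_xpath( 'http://www.example.com/' );

The returned array can be looped over exactly like the result of get_links() in the main snippet.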