colonymech / docs / www / colonyscout / internal / simple_html_dom.php @ f59acf11
History | View | Annotate | Download (34.3 KB)
<?php
/*******************************************************************************
Version: 1.11 ($Rev: 175 $)
Website: http://sourceforge.net/projects/simplehtmldom/
Author: S.C. Chen <me578022@gmail.com>
Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
Contributions by:
    Yousuke Kumakura (Attribute filters)
    Vadim Voituk (Negative indexes supports of "find" method)
    Antcs (Constructor with automatically load contents either text or file/url)
Licensed under The MIT License
Redistributions of files must retain the above copyright notice.
*******************************************************************************/

// Node kinds, stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT',    3);
define('HDOM_TYPE_ENDTAG',  4);
define('HDOM_TYPE_ROOT',    5);
define('HDOM_TYPE_UNKNOWN', 6);

// Quoting style recorded for each parsed attribute value.
// Note that HDOM_QUOTE_NO is 3; the value 2 is not used.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO',     3);

// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
define('HDOM_INFO_BEGIN',    0);
define('HDOM_INFO_END',      1);
define('HDOM_INFO_QUOTE',    2);
define('HDOM_INFO_SPACE',    3);
define('HDOM_INFO_TEXT',     4);
define('HDOM_INFO_INNER',    5);
define('HDOM_INFO_OUTER',    6);
define('HDOM_INFO_ENDSPACE', 7);

// helper functions
// -----------------------------------------------------------------------------

/**
 * Build a DOM from a file or URL.
 *
 * Every argument is forwarded unchanged to file_get_contents(), so the
 * optional parameters of that function (include path flag, stream context,
 * offset, ...) can be supplied here as well. The markup is always loaded
 * with lowercase mode switched on.
 *
 * @return simple_html_dom the parser object holding the loaded document
 */
function file_get_html() {
    $document = new simple_html_dom;
    $arguments = func_get_args();
    $contents = call_user_func_array('file_get_contents', $arguments);
    $document->load($contents, true);
    return $document;
}

/**
 * Build a DOM from a string of markup.
 *
 * @param string $str       markup to parse
 * @param bool   $lowercase passed through to simple_html_dom::load()
 * @return simple_html_dom the parser object holding the loaded document
 */
function str_get_html($str, $lowercase=true) {
    $document = new simple_html_dom;
    $document->load($str, $lowercase);
    return $document;
}

/**
 * Echo an indented outline of a node and everything below it.
 *
 * One line is written per node: the tag name, optionally followed by the
 * attribute list in the form ([name]=>"value", ...). Each nesting level is
 * indented by one more space.
 *
 * @param object $node      node to print; its tag, attr and nodes members are read
 * @param bool   $show_attr whether to append the attribute list
 * @param int    $deep      current nesting depth (indent width)
 */
function dump_html_tree($node, $show_attr=true, $deep=0) {
    $line = str_repeat(' ', $deep).$node->tag;

    if ($show_attr && count($node->attr)>0) {
        $line .= '(';
        foreach ($node->attr as $attr_name=>$unused) {
            // Read through the node's property access, not the raw array value.
            $line .= "[$attr_name]=>\"".$node->$attr_name.'", ';
        }
        $line .= ')';
    }
    echo $line."\n";

    foreach ($node->nodes as $child_node) {
        dump_html_tree($child_node, $show_attr, $deep+1);
    }
}

/**
 * Deprecated alias of file_get_html().
 *
 * All arguments are handed on as given.
 *
 * @return simple_html_dom the parser object holding the loaded document
 */
function file_get_dom() {
    $arguments = func_get_args();
    return call_user_func_array('file_get_html', $arguments);
}

/**
 * Deprecated alias of str_get_html().
 *
 * @param string $str       markup to parse
 * @param bool   $lowercase passed through to simple_html_dom::load()
 * @return simple_html_dom the parser object holding the loaded document
 */
function str_get_dom($str, $lowercase=true) {
    return str_get_html($str, $lowercase);
}

// simple html dom node
// -----------------------------------------------------------------------------
/**
 * One node of a parsed document.
 *
 * A node registers itself in the owning parser's flat $nodes list when it is
 * constructed. The HDOM_INFO_BEGIN / HDOM_INFO_END entries of $_ are indexes
 * into that list; seek() scans the range between them to visit descendants.
 */
class simple_html_dom_node {
    // One of the HDOM_TYPE_* constants.
    public $nodetype = HDOM_TYPE_TEXT;
    // Tag name; the parser uses 'text', 'comment', 'unknown' and 'root' for non-elements.
    public $tag = 'text';
    // Attribute name => value. true marks a value-less attribute; null/false is skipped on output.
    public $attr = array();
    // Child nodes that were linked as children (see simple_html_dom::link_nodes()).
    public $children = array();
    // Every directly contained node, in document order.
    public $nodes = array();
    // Parent node, or null.
    public $parent = null;
    // Bookkeeping array indexed by the HDOM_INFO_* constants.
    public $_ = array();
    // Owning parser object.
    private $dom = null;

    /**
     * @param object $dom owning parser; this node is appended to its $nodes list
     */
    function __construct($dom) {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }

    function __destruct() {
        $this->clear();
    }

    function __toString() {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
    // After this call the node no longer references its parser, parent or
    // descendants, so most other methods must not be used any more.
    function clear() {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    // dump node's tree
    function dump($show_attr=true) {
        dump_html_tree($this, $show_attr);
    }

    // returns the parent of node
    function parent() {
        return $this->parent;
    }

    /**
     * Returns the children of the node.
     *
     * @param int $idx -1 for the whole list, otherwise the index of one child
     * @return array|simple_html_dom_node|null null when $idx does not exist
     */
    function children($idx=-1) {
        if ($idx===-1) return $this->children;
        if (isset($this->children[$idx])) return $this->children[$idx];
        return null;
    }

    // returns the first child of node, or null when there is none
    function first_child() {
        // $children is null after clear(); count(null) is an error on PHP 8.
        if (is_array($this->children) && count($this->children)>0) return $this->children[0];
        return null;
    }

    // returns the last child of node, or null when there is none
    function last_child() {
        if (is_array($this->children) && ($count=count($this->children))>0) return $this->children[$count-1];
        return null;
    }

    // returns the next sibling of node
    function next_sibling() {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        // locate this node among the parent's children
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        if (++$idx>=$count) return null;
        return $this->parent->children[$idx];
    }

    // returns the previous sibling of node
    function prev_sibling() {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        if (--$idx<0) return null;
        return $this->parent->children[$idx];
    }

    /**
     * Get the node's inner html.
     *
     * An inner text set through __set() wins; text-like nodes return their
     * stored text; otherwise the outer text of all contained nodes is joined.
     */
    function innertext() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '';
        foreach($this->nodes as $n)
            $ret .= $n->outertext();
        return $ret;
    }

    /**
     * Get the node's outer text (with tag).
     *
     * When the parser has a callback set, it is invoked with this node before
     * anything is rendered, so it can still modify the node.
     */
    function outertext() {
        if ($this->tag==='root') return $this->innertext();

        // trigger callback
        if ($this->dom->callback!==null)
            call_user_func_array($this->dom->callback, array($this));

        if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        // render begin tag
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();

        // render inner text
        if (isset($this->_[HDOM_INFO_INNER]))
            $ret .= $this->_[HDOM_INFO_INNER];
        else {
            foreach($this->nodes as $n)
                $ret .= $n->outertext();
        }

        // render end tag (an END of 0 marks a node that was never explicitly closed)
        if(isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
            $ret .= '</'.$this->tag.'>';
        return $ret;
    }

    /**
     * Get the node's plain text.
     *
     * Comments, unknown nodes and the contents of script/style elements
     * contribute nothing.
     */
    function text() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }
        if (strcasecmp($this->tag, 'script')===0) return '';
        if (strcasecmp($this->tag, 'style')===0) return '';

        $ret = '';
        foreach($this->nodes as $n)
            $ret .= $n->text();
        return $ret;
    }

    // inner text with the CDATA wrapper markers removed
    function xmltext() {
        $ret = $this->innertext();
        $ret = str_ireplace('<![CDATA[', '', $ret);
        $ret = str_replace(']]>', '', $ret);
        return $ret;
    }

    /**
     * Build the node's opening tag (or its raw text for text-like nodes).
     *
     * The spacing and quoting recorded for each attribute are replayed by
     * position, so $i counts every attribute, including skipped ones.
     */
    function makeup() {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '<'.$this->tag;
        $i = -1;

        foreach($this->attr as $key=>$val) {
            ++$i;

            // skip removed attribute
            if ($val===null || $val===false)
                continue;

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
            //no value attr: nowrap, checked selected...
            if ($val===true)
                $ret .= $key;
            else {
                switch($this->_[HDOM_INFO_QUOTE][$i]) {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }
                $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
            }
        }
        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Find elements by css selector.
     *
     * @param string   $selector one or more comma-separated selector groups
     * @param int|null $idx      null for all matches; otherwise the index of
     *                           one match, negative values counting from the end
     * @return array|simple_html_dom_node|null matches in document order, or a
     *                           single node / null when $idx is given
     */
    function find($selector, $idx=null) {
        $selectors = $this->parse_selector($selector);
        if (($count=count($selectors))===0) return array();
        $found_keys = array();

        // find each selector
        for ($c=0; $c<$count; ++$c) {
            // Depth of THIS group. The previous code always used group 0, so
            // later groups were walked with the wrong number of levels.
            if (($level=count($selectors[$c]))===0) return array();
            if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

            $head = array($this->_[HDOM_INFO_BEGIN]=>1);

            // handle descendant selectors, no recursive!
            for ($l=0; $l<$level; ++$l) {
                $ret = array();
                foreach($head as $k=>$v) {
                    // the root node is keyed -1 and is not addressed through the list
                    $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                    $n->seek($selectors[$c][$l], $ret);
                }
                $head = $ret;
            }

            foreach($head as $k=>$v) {
                if (!isset($found_keys[$k]))
                    $found_keys[$k] = 1;
            }
        }

        // sort keys
        ksort($found_keys);

        $found = array();
        foreach($found_keys as $k=>$v)
            $found[] = $this->dom->nodes[$k];

        // return nth-element or array
        if (is_null($idx)) return $found;
        else if ($idx<0) $idx = count($found) + $idx;
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Seek for nodes below this one that satisfy one selector step.
     *
     * @param array $selector array(tag, key, val, exp, no_key) as produced by parse_selector()
     * @param array $ret      receives node-list index => 1 for every match
     */
    protected function seek($selector, &$ret) {
        list($tag, $key, $val, $exp, $no_key) = $selector;

        // xpath index
        if ($tag && $key && is_numeric($key)) {
            $count = 0;
            foreach ($this->children as $c) {
                if ($tag==='*' || $tag===$c->tag) {
                    if (++$count==$key) {
                        $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                        return;
                    }
                }
            }
            return;
        }

        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end==0) {
            // No usable end index on this node: borrow the one of the nearest
            // ancestor that has one. The null test comes first so the walk
            // can never dereference a missing parent.
            $parent = $this->parent;
            while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            if ($parent!==null) $end += $parent->_[HDOM_INFO_END];
        }

        for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
            $node = $this->dom->nodes[$i];
            $pass = true;

            if ($tag==='*' && !$key) {
                if (in_array($node, $this->children, true))
                    $ret[$i] = 1;
                continue;
            }

            // compare tag
            if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
            // compare key
            if ($pass && $key) {
                if ($no_key) {
                    if (isset($node->attr[$key])) $pass=false;
                }
                else if (!isset($node->attr[$key])) $pass=false;
            }
            // compare value
            if ($pass && $key && $val && $val!=='*') {
                // guard the lookup: with a negated key the attribute is absent here
                $actual = isset($node->attr[$key]) ? $node->attr[$key] : null;
                $check = $this->match($exp, $val, $actual);
                // handle multiple class
                if (!$check && strcasecmp($key, 'class')===0) {
                    foreach(explode(' ', (string)$actual) as $k) {
                        $check = $this->match($exp, $val, $k);
                        if ($check) break;
                    }
                }
                if (!$check) $pass = false;
            }
            if ($pass) $ret[$i] = 1;
            unset($node);
        }
    }

    /**
     * Compare an attribute value with a selector value.
     *
     * @param string $exp     operator: '=', '!=', '^=', '$=' or '*='
     * @param string $pattern value from the selector; for '*=' it is used as a
     *                        regular expression (as-is when it starts with '/')
     * @param mixed  $value   attribute value of the candidate node
     * @return bool|int truthy on match (preg_match() results are returned unchanged)
     */
    protected function match($exp, $pattern, $value) {
        switch ($exp) {
            case '=':
                return ($value===$pattern);
            case '!=':
                return ($value!==$pattern);
            case '^=':
                return preg_match("/^".preg_quote($pattern,'/')."/", $value);
            case '$=':
                return preg_match("/".preg_quote($pattern,'/')."$/", $value);
            case '*=':
                if ($pattern[0]=='/')
                    return preg_match($pattern, $value);
                return preg_match("/".$pattern."/i", $value);
        }
        return false;
    }

    /**
     * Split a selector string into groups of selector steps.
     *
     * @param string $selector_string e.g. 'div#main a.ext, img[src]'
     * @return array list of groups (one per comma-separated part); each group
     *               is a list of array(tag, key, val, exp, no_key) steps
     */
    protected function parse_selector($selector_string) {
        // pattern of CSS selectors, modified from mootools
        // The hyphens are escaped: "\w-:" is rejected as an invalid range by PCRE2.
        $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
        preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            $m[0] = trim($m[0]);
            if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
            // for browser generated xpath
            if ($m[1]==='tbody') continue;

            list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
            if(!empty($m[2])) {$key='id'; $val=$m[2];}
            if(!empty($m[3])) {$key='class'; $val=$m[3];}
            if(!empty($m[4])) {$key=$m[4];}
            if(!empty($m[5])) {$exp=$m[5];}
            if(!empty($m[6])) {$val=$m[6];}

            // convert to lowercase
            if ($this->dom->lowercase) {
                $tag = strtolower($tag);
                // $key stays null for tag-only steps; strtolower(null) is deprecated
                if ($key!==null) $key = strtolower($key);
            }
            //elements that do NOT have the specified attribute
            if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}

            $result[] = array($tag, $key, $val, $exp, $no_key);
            if (trim($m[7])===',') {
                $selectors[] = $result;
                $result = array();
            }
        }
        if (count($result)>0)
            $selectors[] = $result;
        return $selectors;
    }

    /**
     * Property-style read access.
     *
     * An attribute with a non-null value wins. Otherwise the virtual
     * properties outertext / innertext / plaintext / xmltext are served, and
     * any other name yields a bool telling whether the attribute key exists.
     */
    function __get($name) {
        if (isset($this->attr[$name])) return $this->attr[$name];
        switch($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->text();
            case 'xmltext': return $this->xmltext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Property-style write access.
     *
     * 'outertext' and 'innertext' override the rendered text; every other
     * name sets an attribute. A new attribute gets default spacing and double
     * quotes so makeup() can render it.
     */
    function __set($name, $value) {
        switch($name) {
            case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                return $this->_[HDOM_INFO_INNER] = $value;
        }
        if (!isset($this->attr[$name])) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }
        $this->attr[$name] = $value;
    }

    function __isset($name) {
        switch($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        //no value attr: nowrap, checked selected...
        return array_key_exists($name, $this->attr);
    }

    function __unset($name) {
        if (isset($this->attr[$name]))
            unset($this->attr[$name]);
    }

    // camel naming conventions
    function getAllAttributes() {return $this->attr;}
    function getAttribute($name) {return $this->__get($name);}
    function setAttribute($name, $value) {$this->__set($name, $value);}
    function hasAttribute($name) {return $this->__isset($name);}
    function removeAttribute($name) {$this->__set($name, null);}
    function getElementById($id) {return $this->find("#$id", 0);}
    function getElementsById($id, $idx=null) {return $this->find("#$id", $idx);}
    function getElementByTagName($name) {return $this->find($name, 0);}
    function getElementsByTagName($name, $idx=null) {return $this->find($name, $idx);}
    function parentNode() {return $this->parent();}
    function childNodes($idx=-1) {return $this->children($idx);}
    function firstChild() {return $this->first_child();}
    function lastChild() {return $this->last_child();}
    function nextSibling() {return $this->next_sibling();}
    function previousSibling() {return $this->prev_sibling();}
}

// simple html dom parser
// -----------------------------------------------------------------------------
class simple_html_dom {
    // Root node of the loaded document.
    public $root = null;
    // Flat list of every node, in creation order; nodes append themselves.
    public $nodes = array();
    // Optional callable invoked by a node at the start of outertext().
    public $callback = null;
    // When true, tag and attribute names are lowercased while parsing.
    public $lowercase = false;

    // Parser state.
    protected $pos;     // byte offset into $doc
    protected $doc;     // markup being parsed
    protected $char;    // character at $pos, or null past the end
    protected $size;    // strlen($doc)
    protected $cursor;  // index the next created node will have in $nodes
    protected $parent;  // node that new nodes are currently attached to
    protected $noise = array();

    // Character sets handed to strspn()/strcspn() by the scanner helpers.
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/>';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';

    // use isset instead of in_array, performance boost about 30%...
    protected $self_closing_tags = array(
        'img'=>1,
        'br'=>1,
        'input'=>1,
        'meta'=>1,
        'link'=>1,
        'hr'=>1,
        'base'=>1,
        'embed'=>1,
        'spacer'=>1,
    );
    protected $block_tags = array(
        'root'=>1,
        'body'=>1,
        'form'=>1,
        'div'=>1,
        'span'=>1,
        'table'=>1,
    );
    // For each tag: the open tags it implicitly closes when it starts.
    protected $optional_closing_tags = array(
        'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
        'th'=>array('th'=>1),
        'td'=>array('td'=>1),
        'li'=>array('li'=>1),
        'dt'=>array('dt'=>1, 'dd'=>1),
        'dd'=>array('dd'=>1, 'dt'=>1),
        'dl'=>array('dd'=>1, 'dt'=>1),
        'p'=>array('p'=>1),
        'nobr'=>array('nobr'=>1),
    );

/**
 * Optionally load a document straight away.
 *
 * A string that starts with http:// or https:// or that names an existing
 * file is fetched through load_file(); any other non-empty string is parsed
 * as markup.
 *
 * NOTE(review): because the argument may be treated as a URL or path, do not
 * pass untrusted input here; call load() directly for strings that must
 * always be parsed as markup.
 *
 * @param string|null $str markup, file path or URL; null/empty loads nothing
 */
function __construct($str=null) {
    if (!$str) return;

    // "https?" so that TLS URLs are fetched too instead of being parsed as text.
    if (preg_match("/^https?:\/\//i", $str) || is_file($str))
        $this->load_file($str);
    else
        $this->load($str);
}

/**
 * Release node references when the parser object is destroyed.
 */
function __destruct() {
    $this->clear();
}

/**
 * Load html from string.
 *
 * The parser state is reset, a series of regex-matched regions is handed to
 * remove_noise() (defined outside this view; presumably it masks them so the
 * tag scanner does not see them), and then the document is parsed node by
 * node until parse() reports the end.
 *
 * @param string $str       markup to parse
 * @param bool   $lowercase passed on to prepare()
 */
function load($str, $lowercase=true) {
    $this->prepare($str, $lowercase);

    // Argument lists for remove_noise(), applied in this order. Entries keep
    // the exact arity of the individual calls they replace.
    $noise_rules = array(
        // comments
        array("'<!--(.*?)-->'is"),
        // cdata
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),
        // <style> tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"),
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"),
        // <script> tags
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"),
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"),
        // preformatted tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"),
        // server side scripts
        array("'(<\?)(.*?)(\?>)'s", true),
        // smarty scripts
        array("'(\{\w)(.*?)(\})'s", true),
    );
    foreach ($noise_rules as $rule_args) {
        call_user_func_array(array($this, 'remove_noise'), $rule_args);
    }

    // parsing: one node (or closing tag) per call until nothing is left
    do {
        $more = $this->parse();
    } while ($more);

    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
}

/**
 * Load html from file.
 *
 * All arguments are forwarded to file_get_contents(); the result is parsed
 * with lowercase mode switched on.
 */
function load_file() {
    $arguments = func_get_args();
    $contents = call_user_func_array('file_get_contents', $arguments);
    $this->load($contents, true);
}

/**
 * Set the callback function.
 *
 * Nodes invoke it with themselves as the only argument at the start of
 * outertext().
 *
 * @param callable $function_name callback to register
 */
function set_callback($function_name) {
    $this->callback = $function_name;
}

/**
 * Remove the callback function registered with set_callback().
 */
function remove_callback() {
    $this->callback = null;
}

/**
 * Save dom as string.
 *
 * @param string $filepath when not '', the markup is also written to this file
 * @return string the rendered document
 */
function save($filepath='') {
    $markup = $this->root->innertext();
    if ($filepath!=='') {
        file_put_contents($filepath, $markup);
    }
    return $markup;
}

/**
 * Find dom node by css selector, searching from the document root.
 *
 * @param string   $selector css selector
 * @param int|null $idx      null for all matches, otherwise the index of one match
 * @return mixed whatever the root node's find() returns
 */
function find($selector, $idx=null) {
    $root_node = $this->root;
    return $root_node->find($selector, $idx);
}

/**
 * Clean up memory due to php5 circular references memory leak...
 *
 * Every node is cleared, then the parser's own references to the current
 * parent node, the root, the document text and the noise table are unset.
 */
function clear() {
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    if (isset($this->parent)) {
        $this->parent->clear();
        unset($this->parent);
    }

    if (isset($this->root)) {
        $this->root->clear();
        unset($this->root);
    }

    unset($this->doc);
    unset($this->noise);
}

/**
 * Echo an outline of the whole document tree.
 *
 * @param bool $show_attr whether attributes are listed next to each tag
 */
function dump($show_attr=true) {
    $root_node = $this->root;
    $root_node->dump($show_attr);
}

/**
 * Prepare HTML data and init everything.
 *
 * Any previously loaded document is cleared, the scanner is rewound and a
 * fresh root node is created; the root becomes the current parent.
 *
 * @param string $str       markup that is about to be parsed
 * @param bool   $lowercase stored in $this->lowercase
 */
protected function prepare($str, $lowercase=true) {
    $this->clear();

    // scanner state
    $this->doc = $str;
    $this->pos = 0;
    // Index 0 of $nodes is taken by the root node created below.
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();
    $this->lowercase = $lowercase;

    // synthetic root node
    $root_node = new simple_html_dom_node($this);
    $root_node->tag = 'root';
    $root_node->_[HDOM_INFO_BEGIN] = -1;
    $root_node->nodetype = HDOM_TYPE_ROOT;
    $this->root = $root_node;
    $this->parent = $root_node;

    // set the length of content
    $this->size = strlen($str);
    if ($this->size>0) {
        $this->char = $this->doc[0];
    }
}

/**
 * Parse html content: consume the next piece of the document.
 *
 * Text up to the next '<' becomes a text node. When there is no such text
 * the work is delegated to read_tag().
 *
 * @return bool true when something was consumed; otherwise the result of read_tag()
 */
protected function parse() {
    $text = $this->copy_until_char('<');
    if ($text==='') {
        return $this->read_tag();
    }

    // text
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;
    $text_node->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($text_node, false);
    return true;
}

/**
 * Read tag info: consume one tag starting at the current '<'.
 *
 * Handles, in this order: end tags (closing the matching open element or
 * falling back to a text node), "<!...>" constructs (comments / unknown),
 * stray '<' characters that turn out to be text, and finally a begin tag
 * with its attributes.
 *
 * @return bool false when the current character is not '<' (the document is
 *              finished and the root's end index is recorded), true otherwise
 */
protected function read_tag() {
    if ($this->char!=='<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char==='/') {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        // Skip blanks between "</" and the tag name. This used to pass the
        // undeclared property token_blank_t, so nothing was ever skipped.
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false)
            $tag = substr($tag, 0, $pos);

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower!==$tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                // the open element may be left unclosed: mark it and look for
                // the ancestor this end tag belongs to
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    if ($this->parent->parent) $this->parent = $this->parent->parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else
                // no open element matches: keep the end tag as plain text
                return $this->as_text_node($tag);
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0]==='!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, true);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text: another '<' inside the would-be tag name
    if (strpos($tag, '<')!==false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name: keep the raw text. The hyphen is escaped because
    // "\w-:" is rejected as an invalid range by PCRE2.
    if (!preg_match("/^[\w\-:]+$/", $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char==='<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
        $this->link_nodes($node, false);
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags
    if (isset($this->optional_closing_tags[$tag_lower]) ) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        if ($this->char!==null && $space[0]==='') break;
        $name = $this->copy_until($this->token_equal);
        if($guard===$this->pos) {
            // no progress since the last round: step over one character
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if($this->pos>=$this->size-1 && $this->char!=='>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if($this->doc[$this->pos-1]=='<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1);
            $this->pos -= 2;
            $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name!=='/' && $name!=='') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        }
        else
            break;
    } while($this->char!=='>' && $this->char!=='/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>')==='/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    }
    else {
        // reset parent: following nodes are attached below this element,
        // unless it is one of the tags that never has content
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
    }
    $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    return true;
}

/**
 * Parse attributes: read one attribute value and store it on the node.
 *
 * The scanner is positioned just behind the '='. Blanks in front of the
 * value are kept in $space[2]; the quoting style is recorded in the node's
 * HDOM_INFO_QUOTE list.
 *
 * @param simple_html_dom_node $node  node that receives the attribute
 * @param string               $name  attribute name
 * @param array                $space spacing triple of the attribute; slot 2 is filled here
 */
protected function parse_attr($node, $name, &$space) {
    $space[2] = $this->copy_skip($this->token_blank);

    $opening = $this->char;
    if ($opening==='"' || $opening==='\'') {
        // quoted value: remember the style, step over the opening quote,
        // copy up to the matching quote and step over that one too
        $node->_[HDOM_INFO_QUOTE][] = ($opening==='"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape($opening));
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return;
    }

    // unquoted value: runs up to the next character of token_attr
    $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
    $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
}

844 | // link node's parent
|
||
// Hook $node into the tree below the parser's current parent.
// Every node is appended to the parent's `nodes` list; it is appended to
// `children` as well only when $is_child is truthy.
protected function link_nodes(&$node, $is_child) {
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;

    if ($is_child) {
        $this->parent->children[] = $node;
    }
}
||
851 | |||
852 | // as a text node
|
||
// Record a closing tag as literal text: creates a node whose text is
// "</$tag>", links it under the current parent (in `nodes` only, not in
// `children`), advances one character and returns true.
protected function as_text_node($tag) {
    $node = new simple_html_dom_node($this);
    ++$this->cursor;

    $node->_[HDOM_INFO_TEXT] = "</{$tag}>";
    $this->link_nodes($node, false);

    // move on to the next character, or null once the document is exhausted
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
    return true;
}
||
861 | |||
// Advance the cursor past the leading run of characters that all belong to
// $chars, then refresh the current character (null at end of document).
protected function skip($chars) {
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
||
866 | |||
// Like skip(), but also returns what was skipped: consumes the leading run
// of characters contained in $chars and returns that run ('' when the
// current character is not in $chars).
protected function copy_skip($chars) {
    $begin = $this->pos;
    $run = strspn($this->doc, $chars, $begin);

    $this->pos += $run;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return ($run === 0) ? '' : substr($this->doc, $begin, $run);
}
||
875 | |||
// Consume characters up to (not including) the first one that appears in
// $chars and return the consumed text. The cursor ends on the stopping
// character, or at end of document with the current character set to null.
protected function copy_until($chars) {
    $begin = $this->pos;
    $run = strcspn($this->doc, $chars, $begin);

    $this->pos += $run;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $begin, $run);
}
||
883 | |||
// Consume text up to (not including) the next occurrence of $char and
// return it.
//   - already at end of document: returns '' and changes nothing
//   - $char not found: returns the rest of the document and moves the cursor
//     to the end (current character becomes null)
//   - $char is at the cursor: returns '' and changes nothing
protected function copy_until_char($char) {
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
||
900 | |||
// Same contract as copy_until_char(), except that an occurrence of $char
// immediately preceded by a backslash is treated as escaped and skipped.
//   - already at end of document: returns '' and changes nothing
//   - no unescaped $char found: returns the rest of the document and moves
//     the cursor to the end (current character becomes null)
//   - $char is at the cursor: returns '' and changes nothing
protected function copy_until_char_escape($char) {
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $search = $from;

    for (;;) {
        $hit = strpos($this->doc, $char, $search);

        if ($hit === false) {
            $rest = substr($this->doc, $from, $this->size - $from);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $from) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            break;
        }

        // escaped delimiter: keep looking just past it
        $search = $hit + 1;
    }

    $this->char = $this->doc[$hit];
    $this->pos = $hit;
    return substr($this->doc, $from, $hit - $from);
}
||
926 | |||
927 | // remove noise from html content
|
||
// Replace every match of $pattern in the document with a placeholder key
// and remember the replaced text in $this->noise under that key.
//   $pattern    - PCRE pattern to search for
//   $remove_tag - when true the whole match (group 0) is replaced,
//                 otherwise only capture group 1
// Placeholders are '___noise___' followed by count($this->noise)+100
// formatted with '% 3d'.
protected function remove_noise($pattern, $remove_tag=false) {
    $total = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
    $group = $remove_tag ? 0 : 1;

    // go from the last match to the first so that the recorded offsets of
    // the matches still to be processed stay valid after each replacement
    for ($i = $total - 1; $i >= 0; --$i) {
        $fragment = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $key = '___noise___' . sprintf('% 3d', count($this->noise) + 100);
        $this->noise[$key] = $fragment;
        $this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));
    }

    // the document changed length: refresh size and current character
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
||
942 | |||
943 | // restore noise to html content
|
||
// Put back the original text for every noise placeholder found in $text.
// A placeholder is the marker '___noise___' followed by three key
// characters; the pair is looked up in $this->noise.
//   $text - string possibly containing placeholders
// Returns $text with every known placeholder expanded. Markers whose key is
// not in $this->noise (or that are cut off at the end of the string) are
// left untouched.
function restore_noise($text) {
    $marker = '___noise___';
    $marker_len = strlen($marker);
    $offset = 0;

    while (($pos = strpos($text, $marker, $offset)) !== false) {
        // substr() instead of direct indexing: a marker at the very end of
        // the string must not read past the end
        $key = $marker . substr($text, $pos + $marker_len, 3);

        if (isset($this->noise[$key])) {
            $text = substr($text, 0, $pos) . $this->noise[$key] . substr($text, $pos + $marker_len + 3);
            // rescan from the same spot: the restored text may itself
            // contain placeholders that need expanding
            $offset = $pos;
        } else {
            // unknown key: step past this marker, otherwise the same
            // position would be found again forever
            $offset = $pos + 1;
        }
    }
    return $text;
}
||
952 | |||
// String conversion of the DOM: the inner text of the root node.
function __toString() {
    $root = $this->root;
    return $root->innertext();
}
||
956 | |||
// Magic read access for the virtual properties of the DOM object.
//   'outertext' and 'innertext' both give the root node's innertext()
//   'plaintext' gives the root node's text()
// Any other name yields null.
function __get($name) {
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }
    if ($name === 'plaintext') {
        return $this->root->text();
    }
    return null;
}
||
964 | |||
965 | // camel naming conventions
|
||
// camelCase alias: forwards to childNodes($idx) on the root node.
function childNodes($idx=-1) {
    $root = $this->root;
    return $root->childNodes($idx);
}
||
// camelCase alias: forwards to first_child() on the root node.
function firstChild() {
    $root = $this->root;
    return $root->first_child();
}
||
// camelCase alias: forwards to last_child() on the root node.
function lastChild() {
    $root = $this->root;
    return $root->last_child();
}
||
// camelCase alias: runs find() with the selector '#<id>' and index 0.
function getElementById($id) {
    $selector = '#' . $id;
    return $this->find($selector, 0);
}
||
// camelCase alias: runs find() with the selector '#<id>', passing $idx
// through unchanged (null by default).
function getElementsById($id, $idx=null) {
    $selector = '#' . $id;
    return $this->find($selector, $idx);
}
||
// camelCase alias: runs find() with $name as the selector and index 0.
function getElementByTagName($name) {
    $first = $this->find($name, 0);
    return $first;
}
||
// camelCase alias: runs find() with $name as the selector, passing $idx
// through unchanged (-1 by default).
function getElementsByTagName($name, $idx=-1) {
    $found = $this->find($name, $idx);
    return $found;
}
||
// Read a file/URL and load it into this DOM. All arguments are forwarded
// unchanged to file_get_contents(); its result is handed to load() with
// `true` as the second argument.
function loadFile() {
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);
    $this->load($contents, true);
}
||
974 | } |
||
975 | ?> |