init
This commit is contained in:
commit
72a26edcff
22092 changed files with 2101903 additions and 0 deletions
54
lib/sd/example/example_advanced_selector.php
Normal file
54
lib/sd/example/example_advanced_selector.php
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
<?php
|
||||
// example of how to use advanced selector features
|
||||
include('../simple_html_dom.php');
|
||||
|
||||
// -----------------------------------------------------------------------------
// Descendant selector: 'div div div' matches a <div> that sits inside two
// enclosing <div> elements; index 0 asks for the first match only.
$nested_markup = <<<HTML
<div>
<div>
<div class="foo bar">ok</div>
</div>
</div>
HTML;

$html = str_get_html($nested_markup);
$innermost = $html->find('div div div', 0);
echo $innermost->innertext . '<br>'; // result: "ok"

// -----------------------------------------------------------------------------
// Nested selector: run a second find() on each element returned by the first.
$list_markup = <<<HTML
<ul id="ul1">
<li>item:<span>1</span></li>
<li>item:<span>2</span></li>
</ul>
<ul id="ul2">
<li>item:<span>3</span></li>
<li>item:<span>4</span></li>
</ul>
HTML;

$html = str_get_html($list_markup);
foreach ($html->find('ul') as $list) {
    $entries = $list->find('li');
    foreach ($entries as $entry) {
        echo $entry->innertext . '<br>';
    }
}

// -----------------------------------------------------------------------------
// Parsing checkboxes: report for every checkbox whether it carries the
// "checked" attribute.
$form_markup = <<<HTML
<form name="form1" method="post" action="">
<input type="checkbox" name="checkbox1" value="checkbox1" checked>item1<br>
<input type="checkbox" name="checkbox2" value="checkbox2">item2<br>
<input type="checkbox" name="checkbox3" value="checkbox3" checked>item3<br>
</form>
HTML;

$html = str_get_html($form_markup);
$checkboxes = $html->find('input[type=checkbox]');
foreach ($checkboxes as $box) {
    $state = $box->checked ? ' is checked<br>' : ' is not checked<br>';
    echo $box->name . $state;
}
|
||||
?>
|
||||
37
lib/sd/example/example_basic_selector.php
Normal file
37
lib/sd/example/example_basic_selector.php
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
<?php
|
||||
// example of how to use basic selector to retrieve HTML contents
|
||||
include('../simple_html_dom.php');
|
||||
|
||||
// get DOM from URL or file
$html = file_get_html('http://www.google.com/');

// The fetch result is used as an object everywhere below; stop with a readable
// message instead of a fatal "member function on non-object" error if the page
// could not be turned into a DOM.
if (!is_object($html)) {
    exit('Could not load http://www.google.com/');
}

// find all links
foreach ($html->find('a') as $e) {
    echo $e->href . '<br>';
}

// find all images
foreach ($html->find('img') as $e) {
    echo $e->src . '<br>';
}

// find all images, printing the full tag
foreach ($html->find('img') as $e) {
    echo $e->outertext . '<br>';
}

// find all div tags with id=gbar
foreach ($html->find('div#gbar') as $e) {
    echo $e->innertext . '<br>';
}

// find all span tags with class=gb1
foreach ($html->find('span.gb1') as $e) {
    echo $e->outertext . '<br>';
}

// find all td tags with attribute align=center
foreach ($html->find('td[align=center]') as $e) {
    echo $e->innertext . '<br>';
}

// extract text from table: index 1 asks for the SECOND centered cell, which a
// live page is not guaranteed to contain, so guard the lookup before reading
// from it and print an empty string when it is missing.
$second_cell = $html->find('td[align="center"]', 1);
echo (is_object($second_cell) ? $second_cell->plaintext : '').'<br><hr>';

// extract text from HTML
echo $html->plaintext;
|
||||
?>
|
||||
28
lib/sd/example/example_callback.php
Normal file
28
lib/sd/example/example_callback.php
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
<?php
|
||||
include_once('../simple_html_dom.php');
|
||||
|
||||
|
||||
// 1. Write a function with parameter "$element"
|
||||
/**
 * Element callback: replaces the markup of <input>, <img> and <a> elements
 * with the bare tag name; every other element is left untouched.
 *
 * @param object $element element being visited; its "tag" property is read and
 *                        its "outertext" property may be overwritten
 */
function my_callback($element) {
    // Tags whose complete markup is collapsed to just their name.
    foreach (array('input', 'img', 'a') as $tag_name) {
        if ($element->tag == $tag_name) {
            $element->outertext = $tag_name;
        }
    }
}
|
||||
|
||||
|
||||
// 2. Build the HTML DOM for the page that will be dumped
$dom = file_get_html('http://www.google.com/');

// 3. Register the callback by passing its function name
$dom->set_callback('my_callback');

// 4. The callback function is invoked while the DOM is being dumped
echo $dom;
|
||||
?>
|
||||
5
lib/sd/example/example_extract_html.php
Normal file
5
lib/sd/example/example_extract_html.php
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
<?php
|
||||
include_once('../simple_html_dom.php');
|
||||
|
||||
// Load the page into a DOM, then print only its text content.
$page = file_get_html('http://www.google.com/');
echo $page->plaintext;
|
||||
?>
|
||||
18
lib/sd/example/example_modify_contents.php
Normal file
18
lib/sd/example/example_modify_contents.php
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
<?php
|
||||
// example of how to modify HTML contents
|
||||
include('../simple_html_dom.php');
|
||||
|
||||
// get DOM from URL or file
$html = file_get_html('http://www.google.com/');

// selector => replacement markup, applied in this order:
// every image is removed, every input is replaced by a placeholder
$replacements = array(
    'img'   => '',
    'input' => '[INPUT]',
);

foreach ($replacements as $selector => $replacement) {
    foreach ($html->find($selector) as $node) {
        $node->outertext = $replacement;
    }
}

// dump contents
echo $html;
|
||||
?>
|
||||
44
lib/sd/example/scraping/example_scraping_digg.php
Normal file
44
lib/sd/example/scraping/example_scraping_digg.php
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
<?php
|
||||
include_once('../../simple_html_dom.php');
|
||||
|
||||
/**
 * Scrapes the news summaries from the digg.com front page.
 *
 * @return array list of arrays with the keys 'title', 'details' and 'diggs';
 *               empty when the page could not be loaded or nothing matched.
 *               A part that is missing from an article is reported as ''.
 */
function scraping_digg() {
    // Always return an array so callers can foreach over the result even when
    // nothing was found.
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://digg.com/');
    if (!is_object($html)) {
        return $ret;
    }

    // get news block
    foreach ($html->find('div.news-summary') as $article) {
        // find() with an index yields a single element; guard each lookup so
        // one incomplete article does not abort the whole run.
        $title   = $article->find('h3', 0);
        $details = $article->find('p', 0);
        $diggs   = $article->find('li a strong', 0);

        // Build a fresh item per article so values never leak between rows.
        $ret[] = array(
            // get title
            'title'   => is_object($title) ? trim($title->plaintext) : '',
            // get details
            'details' => is_object($details) ? trim($details->plaintext) : '',
            // get digg count
            'diggs'   => is_object($diggs) ? trim($diggs->plaintext) : '',
        );
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
// test it!

// "http://digg.com" will check user_agent header...
ini_set('user_agent', 'My-Application/2.5');

$ret = scraping_digg();

// Print each story title followed by a two-entry list (details, digg count).
foreach ($ret as $story) {
    echo $story['title'], '<br>';
    echo '<ul>',
         '<li>', $story['details'], '</li>',
         '<li>Diggs: ', $story['diggs'], '</li>',
         '</ul>';
}
|
||||
|
||||
?>
|
||||
59
lib/sd/example/scraping/example_scraping_general.php
Normal file
59
lib/sd/example/scraping/example_scraping_general.php
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
<?php
|
||||
include_once('simple_html_dom.php');
|
||||
|
||||
/**
 * Loads $url, dumps every element matching the selector $search and reports
 * progress as HTML on standard output.
 *
 * @param string $url    URL (or file) to load
 * @param string $search selector handed to find()
 * @return bool true when at least one element matched, false otherwise
 *              (including when the page could not be loaded)
 */
function scraping_generic($url, $search) {
    // Didn't find it yet.
    $return = false;

    // Both values are echoed into an HTML page and, in this example, come
    // straight from a form post, so escape them before printing.
    $safe_url    = htmlspecialchars((string) $url, ENT_QUOTES, 'UTF-8');
    $safe_search = htmlspecialchars((string) $search, ENT_QUOTES, 'UTF-8');

    echo "reading the url: " . $safe_url . "<br/>";
    // create HTML DOM
    $html = file_get_html($url);
    if (!is_object($html)) {
        echo "url could not be read.<br/>";
        return false;
    }
    echo "url has been read.<br/>";

    // get article block
    foreach ($html->find($search) as $found) {
        // Found at least one. (The original wrote "$return - true;", a no-op
        // subtraction, so the function could never report success.)
        $return = true;
        echo "found a: " . $safe_search . "<br/><pre>";
        $found->dump();
        echo "</pre><br/>";
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $return;
}
|
||||
|
||||
|
||||
// ------------------------------------------
error_log ("post:" . print_r($_POST, true));

// Read the submitted values once. Anything that is missing or not a plain
// string (e.g. an array posted as url[]=...) is treated as empty.
$url = (isset($_POST['url']) && is_string($_POST['url'])) ? $_POST['url'] : "";
$search = (isset($_POST['search']) && is_string($_POST['search'])) ? $_POST['search'] : "";
?>
<form method="post">
URL: <input name="url" type="text" value="<?=htmlspecialchars($url, ENT_QUOTES, 'UTF-8');?>"/><br/>
Search: <input name="search" type="text" value="<?=htmlspecialchars($search, ENT_QUOTES, 'UTF-8');?>"/>
<input name="submit" type="submit" value="Submit"/>
</form>
<?php
// -----------------------------------------------------------------------------
// test it!
if (isset ($_POST['submit']))
{
    // NOTE(review): $url is user-controlled and is passed to file_get_html(),
    // which loads URLs as well as files. Do not expose this example on a
    // public server without restricting what may be requested.
    $response = scraping_generic($url, $search);
    if (!$response)
    {
        // Escape the echoed selector: it comes from the request.
        echo "Did not find any: " . htmlspecialchars($search, ENT_QUOTES, 'UTF-8') . "<br />";
    }
}
|
||||
?>
|
||||
51
lib/sd/example/scraping/example_scraping_imdb.php
Normal file
51
lib/sd/example/scraping/example_scraping_imdb.php
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
<?php
|
||||
include_once('../../simple_html_dom.php');
|
||||
|
||||
/**
 * Scrapes title, rating and the overview "info" blocks from an IMDB title page.
 *
 * @param string $url address of the title page
 * @return array map with the keys 'Title' and 'Rating' plus one entry per info
 *               block (keyed by the block's <h5> text); empty when the page
 *               could not be loaded
 */
function scraping_IMDB($url) {
    $ret = array();

    // create HTML DOM
    $html = file_get_html($url);
    if (!is_object($html)) {
        return $ret;
    }

    // get title (guarded: an indexed find() may not yield an element)
    $title = $html->find('title', 0);
    $ret['Title'] = is_object($title) ? $title->innertext : '';

    // get rating
    $rating = $html->find('div[class="general rating"] b', 0);
    $ret['Rating'] = is_object($rating) ? $rating->innertext : '';

    // get overview
    foreach ($html->find('div[class="info"]') as $div) {
        $heading = $div->find('h5', 0);

        // A block without a heading has no key to be stored under.
        if (!is_object($heading)) {
            continue;
        }

        // Stop at the user comments. Use break rather than an early return so
        // the clean-up below still runs.
        if ($heading->innertext == 'User Comments:') {
            break;
        }

        $key = '';
        $val = '';

        foreach ($div->find('*') as $node) {
            if ($node->tag == 'h5') {
                $key = $node->plaintext;
            }

            // links contribute their text, except the "more" link
            if ($node->tag == 'a' && $node->plaintext != 'more') {
                $val .= trim(str_replace("\n", '', $node->plaintext));
            }

            if ($node->tag == 'text') {
                $val .= trim(str_replace("\n", '', $node->plaintext));
            }
        }

        $ret[$key] = $val;
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
// test it!
$ret = scraping_IMDB('http://imdb.com/title/tt0335266/');

// One line per scraped field: bold label followed by its value.
foreach ($ret as $label => $value) {
    echo '<strong>', $label, ' </strong>', $value, '<br>';
}
|
||||
?>
|
||||
35
lib/sd/example/scraping/example_scraping_slashdot.php
Normal file
35
lib/sd/example/scraping/example_scraping_slashdot.php
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
<?php
|
||||
include_once('../../simple_html_dom.php');
|
||||
|
||||
/**
 * Scrapes the article blocks from the slashdot.org front page.
 *
 * @return array list of arrays with the keys 'title' and 'body'; empty when
 *               the page could not be loaded or nothing matched. A part that
 *               is missing from an article is reported as ''.
 */
function scraping_slashdot() {
    // Always return an array so callers can foreach over the result even when
    // nothing was found.
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://slashdot.org/');
    if (!is_object($html)) {
        return $ret;
    }

    // get article block
    foreach ($html->find('div[id^=firehose-]') as $article) {
        // Guard each indexed lookup so one incomplete article does not abort
        // the whole run.
        $title = $article->find('a.datitle', 0);
        $body  = $article->find('div.body', 0);

        // Build a fresh item per article so values never leak between rows.
        $ret[] = array(
            // get title
            'title' => is_object($title) ? trim($title->plaintext) : '',
            // get body
            'body'  => is_object($body) ? trim($body->plaintext) : '',
        );
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
|
||||
|
||||
// -----------------------------------------------------------------------------
// test it!
$ret = scraping_slashdot();

// Print each article title followed by a one-entry list holding its body.
foreach ($ret as $article) {
    echo $article['title'], '<br>';
    echo '<ul>',
         '<li>', $article['body'], '</li>',
         '</ul>';
}
|
||||
?>
|
||||
35
lib/sd/example/simple_html_dom_utility.php
Normal file
35
lib/sd/example/simple_html_dom_utility.php
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
<?php
|
||||
include_once('../simple_html_dom.php');
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// remove HTML comments
|
||||
/**
 * Loads the document at $url and returns its markup with every element matched
 * by the 'comment' selector blanked out.
 *
 * @param string $url address handed to file_get_html()
 * @return mixed whatever the DOM's save() returns after the comments were removed
 */
function html_no_comment($url) {
    // create HTML DOM
    $dom = file_get_html($url);

    // blank out every comment element
    $comments = $dom->find('comment');
    foreach ($comments as $comment_node) {
        $comment_node->outertext = '';
    }

    $markup = $dom->save();

    // clean up memory
    $dom->clear();
    unset($dom);

    return $markup;
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// search for elements that contain a specific text
|
||||
/**
 * Collects the elements matched by $selector whose innertext contains $keyword.
 *
 * @param object $html     object offering find($selector)
 * @param string $selector selector passed on to find()
 * @param string $keyword  text to look for inside each element's innertext
 * @param int    $index    negative (default): return every match as an array;
 *                         otherwise return only the match at that position
 * @return mixed array of matches, a single match, or null when $index is
 *               beyond the last match
 */
function find_contains($html, $selector, $keyword, $index=-1) {
    $matches = array();

    $candidates = $html->find($selector);
    foreach ($candidates as $candidate) {
        // keep only the elements whose inner markup mentions the keyword
        if (strpos($candidate->innertext, $keyword) === false) {
            continue;
        }
        $matches[] = $candidate;
    }

    if ($index < 0) {
        return $matches;
    }

    if (isset($matches[$index])) {
        return $matches[$index];
    }

    return null;
}
|
||||
?>
|
||||
Loading…
Add table
Add a link
Reference in a new issue