This commit is contained in:
steven 2025-08-11 22:23:30 +02:00
commit 72a26edcff
22092 changed files with 2101903 additions and 0 deletions

View file

@ -0,0 +1,54 @@
<?php
// Examples of the advanced selector features.
include('../simple_html_dom.php');

// -----------------------------------------------------------------------------
// descendant selector: a div inside a div inside a div
$markup = <<<HTML
<div>
<div>
<div class="foo bar">ok</div>
</div>
</div>
HTML;

$html = str_get_html($markup);
$innermost = $html->find('div div div', 0);
echo $innermost->innertext . '<br>'; // result: "ok"

// -----------------------------------------------------------------------------
// nested selector: run a second find() on every element of the first result
$markup = <<<HTML
<ul id="ul1">
<li>item:<span>1</span></li>
<li>item:<span>2</span></li>
</ul>
<ul id="ul2">
<li>item:<span>3</span></li>
<li>item:<span>4</span></li>
</ul>
HTML;

$html = str_get_html($markup);
foreach ($html->find('ul') as $list) {
    foreach ($list->find('li') as $entry) {
        echo $entry->innertext . '<br>';
    }
}

// -----------------------------------------------------------------------------
// parsing checkbox: report whether each checkbox carries the "checked" attribute
$markup = <<<HTML
<form name="form1" method="post" action="">
<input type="checkbox" name="checkbox1" value="checkbox1" checked>item1<br>
<input type="checkbox" name="checkbox2" value="checkbox2">item2<br>
<input type="checkbox" name="checkbox3" value="checkbox3" checked>item3<br>
</form>
HTML;

$html = str_get_html($markup);
foreach ($html->find('input[type=checkbox]') as $box) {
    $state = $box->checked ? ' is checked<br>' : ' is not checked<br>';
    echo $box->name . $state;
}
?>

View file

@ -0,0 +1,37 @@
<?php
// Examples of using basic selectors to retrieve HTML contents.
include('../simple_html_dom.php');

// get DOM from URL or file
$html = file_get_html('http://www.google.com/');

// print the href of every link
foreach ($html->find('a') as $anchor) {
    echo $anchor->href . '<br>';
}

// print the src of every image
foreach ($html->find('img') as $image) {
    echo $image->src . '<br>';
}

// print every image again, this time as the full tag
foreach ($html->find('img') as $image) {
    echo $image->outertext . '<br>';
}

// print the inner HTML of every div with id=gbar
foreach ($html->find('div#gbar') as $bar) {
    echo $bar->innertext . '<br>';
}

// print every span with class=gb1 as the full tag
foreach ($html->find('span.gb1') as $span) {
    echo $span->outertext . '<br>';
}

// print the inner HTML of every td with attribute align=center
foreach ($html->find('td[align=center]') as $cell) {
    echo $cell->innertext . '<br>';
}

// extract text from table: plain text of the match at index 1
$centered = $html->find('td[align="center"]', 1);
echo $centered->plaintext . '<br><hr>';

// extract text from the whole document
echo $html->plaintext;
?>

View file

@ -0,0 +1,28 @@
<?php
include_once('../simple_html_dom.php');
// 1. Write a function with parameter "$element"
/**
 * Callback that replaces the outer text of input, img and a elements
 * with the bare tag name; every other element is left untouched.
 *
 * @param object $element element exposing the "tag" and "outertext" properties
 */
function my_callback($element) {
    // tags to rewrite, checked in the same order as before
    $replaced_tags = array('input', 'img', 'a');
    foreach ($replaced_tags as $tag_name) {
        if ($element->tag == $tag_name) {
            $element->outertext = $tag_name;
        }
    }
}
// 2. Create the HTML DOM
$html = file_get_html('http://www.google.com/');

// 3. Register the callback function by its function name
$callback_name = 'my_callback';
$html->set_callback($callback_name);

// 4. The callback function is invoked while the DOM is being dumped
echo $html;
?>

View file

@ -0,0 +1,5 @@
<?php
// Fetch a page and print its plain-text content.
include_once('../simple_html_dom.php');

$page = file_get_html('http://www.google.com/');
echo $page->plaintext;
?>

View file

@ -0,0 +1,18 @@
<?php
// Example of how to modify HTML contents before printing them.
include('../simple_html_dom.php');

// get DOM from URL or file
$html = file_get_html('http://www.google.com/');

// remove all images by blanking their outer text
foreach ($html->find('img') as $image) {
    $image->outertext = '';
}

// replace every input with a placeholder marker
foreach ($html->find('input') as $field) {
    $field->outertext = '[INPUT]';
}

// dump the modified contents
echo $html;
?>

View file

@ -0,0 +1,44 @@
<?php
include_once('../../simple_html_dom.php');
/**
 * Scrape the news summaries from the digg.com front page.
 *
 * @return array list of items, each with the keys 'title', 'details' and
 *               'diggs'; empty when no 'div.news-summary' block is found
 */
function scraping_digg() {
    // start with an empty list so callers always get an array back,
    // even when the page contains no matching block
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://digg.com/');
    // NOTE(review): assumes file_get_html() yields a falsy value when the page
    // cannot be loaded - confirm against the bundled simple_html_dom version
    if (!$html) {
        return $ret;
    }

    // get news block
    foreach ($html->find('div.news-summary') as $article) {
        // fresh item per article so no value leaks over from the previous one
        $item = array();

        // get title
        $item['title'] = trim($article->find('h3', 0)->plaintext);

        // get details
        $item['details'] = trim($article->find('p', 0)->plaintext);

        // get digg count
        $item['diggs'] = trim($article->find('li a strong', 0)->plaintext);

        $ret[] = $item;
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
// -----------------------------------------------------------------------------
// test it!

// "http://digg.com" checks the user_agent header, so set one before fetching
ini_set('user_agent', 'My-Application/2.5');

$ret = scraping_digg();

// print every story as a title followed by a two-entry list
foreach ($ret as $story) {
    echo $story['title'] . '<br>',
        '<ul>',
        '<li>' . $story['details'] . '</li>',
        '<li>Diggs: ' . $story['diggs'] . '</li>',
        '</ul>';
}
?>

View file

@ -0,0 +1,59 @@
<?php
include_once('simple_html_dom.php');
/**
 * Fetch a page and dump every element that matches a selector.
 *
 * Progress messages and the dump of each match are echoed as HTML.
 *
 * @param string $url    address of the page to load
 * @param string $search selector handed to find()
 * @return bool true when at least one element matched, false otherwise
 */
function scraping_generic($url, $search) {
    // Didn't find it yet.
    $return = false;

    // Both values come from the caller (the form in this file passes user
    // input), so escape them before echoing them into the HTML response.
    $url_html = htmlspecialchars((string) $url, ENT_QUOTES, 'UTF-8');
    $search_html = htmlspecialchars((string) $search, ENT_QUOTES, 'UTF-8');

    echo "reading the url: " . $url_html . "<br/>";

    // create HTML DOM
    $html = file_get_html($url);
    // NOTE(review): assumes file_get_html() yields a falsy value when the page
    // cannot be loaded - confirm against the bundled simple_html_dom version
    if (!$html) {
        return $return;
    }
    echo "url has been read.<br/>";

    // get article block
    foreach ($html->find($search) as $found) {
        // Found at least one. (The original wrote "$return - true;", a no-op
        // subtraction, so the function reported "not found" on every call.)
        $return = true;
        echo "found a: " . $search_html . "<br/><pre>";
        $found->dump();
        echo "</pre><br/>";
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $return;
}
// ------------------------------------------
error_log ("post:" . print_r($_POST, true));

// Read the submitted values once; anything missing or non-string (e.g. an
// array sent as url[]=...) falls back to the empty string.
$url = (isset($_POST['url']) && is_string($_POST['url'])) ? $_POST['url'] : "";
$search = (isset($_POST['search']) && is_string($_POST['search'])) ? $_POST['search'] : "";

// HTML-escaped copies for echoing back into the page. Printing the raw POST
// values would let a crafted request inject markup or script (reflected XSS).
$url_html = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
$search_html = htmlspecialchars($search, ENT_QUOTES, 'UTF-8');
?>
<form method="post">
URL: <input name="url" type="text" value="<?=$url_html;?>"/><br/>
Search: <input name="search" type="text" value="<?=$search_html;?>"/>
<input name="submit" type="submit" value="Submit"/>
</form>
<?php
// -----------------------------------------------------------------------------
// test it!
// NOTE(review): the URL is fetched exactly as submitted; restrict the allowed
// schemes/hosts before exposing this page to untrusted users.
if (isset ($_POST['submit']))
{
    $response = scraping_generic($url, $search);
    if (!$response)
    {
        echo "Did not find any: " . $search_html . "<br />";
    }
}
?>

View file

@ -0,0 +1,51 @@
<?php
include_once('../../simple_html_dom.php');
/**
 * Scrape the title, the rating and the overview "info" blocks from an IMDB
 * title page.
 *
 * @param string $url address of the IMDB title page
 * @return array 'Title' and 'Rating' plus one entry per info block, keyed by
 *               the plain text of the block's h5 heading; empty when the page
 *               could not be loaded
 */
function scraping_IMDB($url) {
    $ret = array();

    // create HTML DOM
    $html = file_get_html($url);
    // NOTE(review): assumes file_get_html() yields a falsy value when the page
    // cannot be loaded - confirm against the bundled simple_html_dom version
    if (!$html) {
        return $ret;
    }

    // get title
    $ret['Title'] = $html->find('title', 0)->innertext;

    // get rating
    $ret['Rating'] = $html->find('div[class="general rating"] b', 0)->innertext;

    // get overview
    foreach ($html->find('div[class="info"]') as $div) {
        // Stop at the user comments. Use break instead of returning here so
        // the memory clean-up below still runs on this path.
        if ($div->find('h5', 0)->innertext=='User Comments:')
            break;

        $key = '';
        $val = '';

        foreach ($div->find('*') as $node) {
            // the h5 heading names the entry
            if ($node->tag=='h5')
                $key = $node->plaintext;

            // link texts are part of the value, except the "more" link
            if ($node->tag=='a' && $node->plaintext!='more')
                $val .= trim(str_replace("\n", '', $node->plaintext));

            // so are plain text nodes
            if ($node->tag=='text')
                $val .= trim(str_replace("\n", '', $node->plaintext));
        }

        $ret[$key] = $val;
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
// -----------------------------------------------------------------------------
// test it!
$ret = scraping_IMDB('http://imdb.com/title/tt0335266/');

// print each scraped field as "<strong>name </strong>value"
foreach ($ret as $label => $value) {
    echo '<strong>' . $label . ' </strong>' . $value . '<br>';
}
?>

View file

@ -0,0 +1,35 @@
<?php
include_once('../../simple_html_dom.php');
/**
 * Scrape the article blocks from the slashdot.org front page.
 *
 * @return array list of items, each with the keys 'title' and 'body';
 *               empty when no 'div[id^=firehose-]' block is found
 */
function scraping_slashdot() {
    // start with an empty list so callers always get an array back,
    // even when the page contains no matching block
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://slashdot.org/');
    // NOTE(review): assumes file_get_html() yields a falsy value when the page
    // cannot be loaded - confirm against the bundled simple_html_dom version
    if (!$html) {
        return $ret;
    }

    // get article block
    foreach ($html->find('div[id^=firehose-]') as $article) {
        // fresh item per article so no value leaks over from the previous one
        $item = array();

        // get title
        $item['title'] = trim($article->find('a.datitle', 0)->plaintext);

        // get body
        $item['body'] = trim($article->find('div.body', 0)->plaintext);

        $ret[] = $item;
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
// -----------------------------------------------------------------------------
// test it!
$ret = scraping_slashdot();

// print every article as a title followed by a one-entry list
foreach ($ret as $article) {
    echo $article['title'] . '<br>',
        '<ul>',
        '<li>' . $article['body'] . '</li>',
        '</ul>';
}
?>

View file

@ -0,0 +1,35 @@
<?php
include_once('../simple_html_dom.php');
// -----------------------------------------------------------------------------
// remove HTML comments
/**
 * Load a page and return its markup with every comment element blanked out.
 *
 * @param string $url address handed to file_get_html()
 * @return mixed whatever save() returns for the modified DOM
 */
function html_no_comment($url) {
    // create HTML DOM
    $dom = file_get_html($url);

    // blank out every comment element
    $comments = $dom->find('comment');
    foreach ($comments as $comment) {
        $comment->outertext = '';
    }

    // serialise before the DOM is released
    $output = $dom->save();

    // clean up memory
    $dom->clear();
    unset($dom);

    return $output;
}
// -----------------------------------------------------------------------------
// search elements that contain a specific text
/**
 * Collect the elements matched by a selector whose inner text contains a keyword.
 *
 * @param object $html     object exposing find($selector)
 * @param string $selector selector handed to find()
 * @param string $keyword  text looked up in each element's innertext
 * @param int    $index    negative (default) to get every match, otherwise
 *                         the position of the single match to return
 * @return mixed array of matching elements when $index is negative; otherwise
 *               the element at $index, or null when there is none
 */
function find_contains($html, $selector, $keyword, $index=-1) {
    $matches = array();

    foreach ($html->find($selector) as $candidate) {
        // skip elements whose inner text lacks the keyword
        if (strpos($candidate->innertext, $keyword) === false) {
            continue;
        }
        $matches[] = $candidate;
    }

    // negative index: hand back the whole list
    if ($index<0) {
        return $matches;
    }

    if (isset($matches[$index])) {
        return $matches[$index];
    }
    return null;
}
?>