This commit is contained in:
steven 2025-08-11 22:23:30 +02:00
commit 72a26edcff
22092 changed files with 2101903 additions and 0 deletions

View file

@ -0,0 +1,44 @@
<?php
include_once('../../simple_html_dom.php');
/**
 * Scrape the digg.com front page and collect one entry per news summary.
 *
 * Each entry is an array with the keys 'title', 'details' and 'diggs'.
 * A field whose element is missing from the markup is returned as ''.
 *
 * @return array List of entries; empty when the page could not be loaded
 *               or contains no 'div.news-summary' blocks.
 */
function scraping_digg() {
    // Always hand back an array so callers can foreach over the result safely.
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://digg.com/');
    if (!$html) {
        // Fetch or parse failed; nothing to scrape.
        return $ret;
    }

    // get news block
    foreach ($html->find('div.news-summary') as $article) {
        // find(..., 0) yields a non-object when nothing matches, so look up
        // each element first and only read ->plaintext from real nodes.
        $title   = $article->find('h3', 0);
        $details = $article->find('p', 0);
        $diggs   = $article->find('li a strong', 0);

        $ret[] = array(
            'title'   => is_object($title)   ? trim($title->plaintext)   : '',
            'details' => is_object($details) ? trim($details->plaintext) : '',
            'diggs'   => is_object($diggs)   ? trim($diggs->plaintext)   : '',
        );
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
// -----------------------------------------------------------------------------
// Demo driver: fetch the digg entries and print each one as a title followed
// by a two-item bullet list.
// digg.com inspects the user_agent header, so one is configured before fetching.
ini_set('user_agent', 'My-Application/2.5');
$ret = scraping_digg();
foreach ($ret as $entry) {
    echo $entry['title'], '<br>',
         '<ul>',
         '<li>', $entry['details'], '</li>',
         '<li>Diggs: ', $entry['diggs'], '</li>',
         '</ul>';
}
?>

View file

@ -0,0 +1,59 @@
<?php
include_once('simple_html_dom.php');
/**
 * Load a page and dump every element matching a selector.
 *
 * Progress messages and the dump of each match are echoed as HTML.
 *
 * NOTE(review): $url is fetched as-is. When it comes from a request (as in
 * the form below) this lets a visitor make the server fetch arbitrary
 * locations -- restrict the allowed schemes/hosts before exposing this.
 *
 * @param string $url    Address of the page to load.
 * @param string $search Selector passed to the DOM's find().
 * @return bool True when at least one element matched, false otherwise
 *              (including when the page could not be loaded).
 */
function scraping_generic($url, $search) {
    // Didn't find it yet.
    $return = false;

    // Both arguments may be user supplied; escape them before echoing into HTML.
    $safeUrl    = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
    $safeSearch = htmlspecialchars($search, ENT_QUOTES, 'UTF-8');

    echo "reading the url: " . $safeUrl . "<br/>";

    // create HTML DOM
    $html = file_get_html($url);
    if (!$html) {
        // Fetch or parse failed; report "nothing found" instead of crashing
        // on a method call against a non-object.
        return $return;
    }
    echo "url has been read.<br/>";

    // get article block
    foreach ($html->find($search) as $found) {
        // Found at least one. (The original wrote "$return - true;", a no-op
        // subtraction, so the function could never report success.)
        $return = true;
        echo "found a: " . $safeSearch . "<br/><pre>";
        $found->dump();
        echo "</pre><br/>";
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $return;
}
// ------------------------------------------
// Log the raw POST payload, then read the two form fields, falling back to an
// empty string for any field that was not submitted.
error_log ("post:" . print_r($_POST, true));
$url = isset($_POST['url']) ? $_POST['url'] : "";
$search = isset($_POST['search']) ? $_POST['search'] : "";
?>
<?php /* Submitted values are echoed back into attribute values, so they are HTML-escaped to stop a crafted value from breaking out of the attribute. */ ?>
<form method="post">
URL: <input name="url" type="text" value="<?= htmlspecialchars($url, ENT_QUOTES, 'UTF-8') ?>"/><br/>
Search: <input name="search" type="text" value="<?= htmlspecialchars($search, ENT_QUOTES, 'UTF-8') ?>"/>
<input name="submit" type="submit" value="Submit"/>
</form>
<?php
// -----------------------------------------------------------------------------
// test it!
// Run the scrape only after the form was submitted. The $url / $search values
// read earlier in this script are used rather than $_POST directly, because
// they default to '' when a field is absent from the request.
if (isset ($_POST['submit']))
{
    $response = scraping_generic($url, $search);
    if (!$response)
    {
        // $search is user input; escape it before echoing into the page.
        echo "Did not find any: " . htmlspecialchars($search, ENT_QUOTES, 'UTF-8') . "<br />";
    }
}
?>

View file

@ -0,0 +1,51 @@
<?php
include_once('../../simple_html_dom.php');
/**
 * Scrape an IMDB title page into a flat key => value array.
 *
 * The result always has 'Title' and 'Rating'. Every 'div[class="info"]' block
 * before the "User Comments:" block then adds one entry, keyed by the text of
 * its h5 heading, whose value is the concatenated link and text content.
 *
 * @param string $url Address of the title page.
 * @return array Scraped fields; empty when the page could not be loaded.
 */
function scraping_IMDB($url) {
    $ret = array();

    // create HTML DOM
    $html = file_get_html($url);
    if (!$html) {
        // Fetch or parse failed; nothing to scrape.
        return $ret;
    }

    // get title -- find(..., 0) yields a non-object when nothing matches
    $title = $html->find('title', 0);
    $ret['Title'] = is_object($title) ? $title->innertext : '';

    // get rating
    $rating = $html->find('div[class="general rating"] b', 0);
    $ret['Rating'] = is_object($rating) ? $rating->innertext : '';

    // get overview
    foreach ($html->find('div[class="info"]') as $div) {
        $heading = $div->find('h5', 0);

        // Stop at the user comments. Leave the loop with break instead of
        // returning here, so the DOM clean-up below always runs.
        if (is_object($heading) && $heading->innertext == 'User Comments:') {
            break;
        }

        $key = '';
        $val = '';
        foreach ($div->find('*') as $node) {
            if ($node->tag == 'h5') {
                $key = $node->plaintext;
            }
            // Links contribute their text, except the "more" links.
            if ($node->tag == 'a' && $node->plaintext != 'more') {
                $val .= trim(str_replace("\n", '', $node->plaintext));
            }
            if ($node->tag == 'text') {
                $val .= trim(str_replace("\n", '', $node->plaintext));
            }
        }
        $ret[$key] = $val;
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
// -----------------------------------------------------------------------------
// Demo driver: scrape one title page and print every field as "<label> value".
$ret = scraping_IMDB('http://imdb.com/title/tt0335266/');
foreach ($ret as $label => $value) {
    echo '<strong>', $label, ' </strong>', $value, '<br>';
}
?>

View file

@ -0,0 +1,35 @@
<?php
include_once('../../simple_html_dom.php');
/**
 * Scrape the slashdot.org front page and collect one entry per article.
 *
 * Each entry is an array with the keys 'title' and 'body'. A field whose
 * element is missing from the markup is returned as ''.
 *
 * @return array List of entries; empty when the page could not be loaded
 *               or contains no elements whose id starts with "firehose-".
 */
function scraping_slashdot() {
    // Always hand back an array so callers can foreach over the result safely.
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://slashdot.org/');
    if (!$html) {
        // Fetch or parse failed; nothing to scrape.
        return $ret;
    }

    // get article block
    foreach ($html->find('div[id^=firehose-]') as $article) {
        // find(..., 0) yields a non-object when nothing matches, so look up
        // each element first and only read ->plaintext from real nodes.
        $title = $article->find('a.datitle', 0);
        $body  = $article->find('div.body', 0);

        $ret[] = array(
            'title' => is_object($title) ? trim($title->plaintext) : '',
            'body'  => is_object($body)  ? trim($body->plaintext)  : '',
        );
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
// -----------------------------------------------------------------------------
// Demo driver: fetch the slashdot articles and print each title followed by
// its body as a single-item bullet list.
$ret = scraping_slashdot();
foreach ($ret as $story) {
    echo $story['title'], '<br>',
         '<ul>',
         '<li>', $story['body'], '</li>',
         '</ul>';
}
?>