init
This commit is contained in:
commit
72a26edcff
22092 changed files with 2101903 additions and 0 deletions
44
lib/sd/example/scraping/example_scraping_digg.php
Normal file
44
lib/sd/example/scraping/example_scraping_digg.php
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
<?php
|
||||
include_once('../../simple_html_dom.php');
|
||||
|
||||
/**
 * Scrape the news summaries from the digg.com front page.
 *
 * Each "div.news-summary" block becomes one entry with the keys
 * 'title', 'details' and 'diggs'. A part that cannot be located in a
 * block is reported as an empty string rather than dereferencing null.
 *
 * @return array List of associative arrays; empty when the page could not
 *               be loaded or contains no matching blocks.
 */
function scraping_digg() {
    // Start from an empty list so callers can always iterate the result.
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://digg.com/');
    if (!$html) {
        // Nothing was loaded, so there is nothing to scan or release.
        return $ret;
    }

    // get news block
    foreach ($html->find('div.news-summary') as $article) {
        $titleNode   = $article->find('h3', 0);
        $detailsNode = $article->find('p', 0);
        $diggsNode   = $article->find('li a strong', 0);

        // Build a fresh entry per article so a missing part never inherits
        // the value from the previous iteration.
        $ret[] = array(
            'title'   => $titleNode ? trim($titleNode->plaintext) : '',
            'details' => $detailsNode ? trim($detailsNode->plaintext) : '',
            // digg count
            'diggs'   => $diggsNode ? trim($diggsNode->plaintext) : '',
        );
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// test it!
|
||||
|
||||
// "http://digg.com" will check user_agent header...
|
||||
// Set a user agent for URL-based stream requests before fetching the page.
ini_set('user_agent', 'My-Application/2.5');

$ret = scraping_digg();

// Print each story as a heading line followed by a two-item list.
foreach ($ret as $story) {
    echo $story['title'], '<br>';
    echo '<ul>',
        '<li>', $story['details'], '</li>',
        '<li>Diggs: ', $story['diggs'], '</li>',
        '</ul>';
}
|
||||
|
||||
?>
|
||||
59
lib/sd/example/scraping/example_scraping_general.php
Normal file
59
lib/sd/example/scraping/example_scraping_general.php
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
<?php
|
||||
include_once('simple_html_dom.php');
|
||||
|
||||
/**
 * Fetch a page and dump every element that matches a selector.
 *
 * Progress messages and a <pre> dump of each match are echoed directly
 * to the output.
 *
 * @param string $url    Location handed to file_get_html().
 * @param string $search Selector passed to find() on the parsed document.
 * @return bool True when at least one element matched; false otherwise,
 *              including when the page could not be loaded.
 */
function scraping_generic($url, $search) {
    // Didn't find it yet.
    $return = false;

    // Both values are echoed back into the page, so escape them once here.
    $safeUrl    = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
    $safeSearch = htmlspecialchars($search, ENT_QUOTES, 'UTF-8');

    echo "reading the url: " . $safeUrl . "<br/>";

    // create HTML DOM
    $html = file_get_html($url);
    if (!$html) {
        // Guard against a falsy result so find() is never called on a non-object.
        echo "url could not be read.<br/>";
        return $return;
    }
    echo "url has been read.<br/>";

    // get article block
    foreach ($html->find($search) as $found) {
        // Found at least one. (This must be an assignment; a bare
        // "$return - true" expression leaves the flag unchanged.)
        $return = true;
        echo "found a: " . $safeSearch . "<br/><pre>";
        $found->dump();
        echo "</pre><br/>";
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $return;
}
|
||||
|
||||
|
||||
// ------------------------------------------
|
||||
// Record the raw submission in the error log.
error_log("post:" . print_r($_POST, true));

// Fall back to empty strings when a field was not posted, so the form
// below always has something to display.
$url = isset($_POST['url']) ? $_POST['url'] : "";
$search = isset($_POST['search']) ? $_POST['search'] : "";
|
||||
?>
|
||||
<?php /* The posted values are echoed back into attribute values, so they are HTML-escaped to keep quotes and markup in the input from breaking out of the attribute. */ ?>
<form method="post">
    URL: <input name="url" type="text" value="<?=htmlspecialchars($url, ENT_QUOTES, 'UTF-8');?>"/><br/>
    Search: <input name="search" type="text" value="<?=htmlspecialchars($search, ENT_QUOTES, 'UTF-8');?>"/>
    <input name="submit" type="submit" value="Submit"/>
</form>
|
||||
<?php
|
||||
// -----------------------------------------------------------------------------
|
||||
// test it!
|
||||
// Run the scrape only after the form has been submitted.
if (isset($_POST['submit']))
{
    // Read the fields defensively: a hand-built request may carry "submit"
    // without the other two fields.
    $submittedUrl    = isset($_POST['url']) ? $_POST['url'] : '';
    $submittedSearch = isset($_POST['search']) ? $_POST['search'] : '';

    // NOTE(review): the fetch target comes straight from the request. If
    // file_get_html() accepts local paths or arbitrary stream wrappers
    // (not visible from here; confirm), this lets a visitor read server-side
    // files or reach internal hosts. Restrict schemes/hosts before exposing
    // this script beyond a trusted environment.
    $response = ($submittedUrl !== '')
        ? scraping_generic($submittedUrl, $submittedSearch)
        : false;

    if (!$response)
    {
        // The selector is user input; escape it before echoing it back.
        echo "Did not find any: " . htmlspecialchars($submittedSearch, ENT_QUOTES, 'UTF-8') . "<br />";
    }
}
|
||||
?>
|
||||
51
lib/sd/example/scraping/example_scraping_imdb.php
Normal file
51
lib/sd/example/scraping/example_scraping_imdb.php
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
<?php
|
||||
include_once('../../simple_html_dom.php');
|
||||
|
||||
/**
 * Scrape the title, rating and overview fields from an IMDB title page.
 *
 * The result always starts with the keys 'Title' and 'Rating' (empty
 * strings when the matching element is absent), followed by one entry per
 * div[class="info"] block, keyed by the block's <h5> text. Scanning stops
 * at the block whose heading is 'User Comments:'.
 *
 * @param string $url Location handed to file_get_html().
 * @return array Associative array of scraped fields; empty when the page
 *               could not be loaded.
 */
function scraping_IMDB($url) {
    $ret = array();

    // create HTML DOM
    $html = file_get_html($url);
    if (!$html) {
        // Nothing was loaded, so there is nothing to scan or release.
        return $ret;
    }

    // get title
    $titleNode = $html->find('title', 0);
    $ret['Title'] = $titleNode ? $titleNode->innertext : '';

    // get rating
    $ratingNode = $html->find('div[class="general rating"] b', 0);
    $ret['Rating'] = $ratingNode ? $ratingNode->innertext : '';

    // get overview
    foreach ($html->find('div[class="info"]') as $div) {
        // Stop at the user comments block. Leaving the loop (instead of
        // returning from inside it) ensures the cleanup below always runs.
        $heading = $div->find('h5', 0);
        if ($heading && $heading->innertext === 'User Comments:') {
            break;
        }

        $key = '';
        $val = '';

        foreach ($div->find('*') as $node) {
            if ($node->tag === 'h5') {
                $key = $node->plaintext;
            }

            // Collect link text (except the "more" links) and text nodes,
            // with newlines removed and surrounding whitespace trimmed.
            $isUsefulLink = ($node->tag === 'a' && $node->plaintext != 'more');
            if ($isUsefulLink || $node->tag === 'text') {
                $val .= trim(str_replace("\n", '', $node->plaintext));
            }
        }

        $ret[$key] = $val;
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
|
||||
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// test it!
|
||||
$ret = scraping_IMDB('http://imdb.com/title/tt0335266/');

// Print every scraped field as "<strong>label </strong>value<br>".
foreach ($ret as $label => $text) {
    echo '<strong>', $label, ' </strong>', $text, '<br>';
}
|
||||
?>
|
||||
35
lib/sd/example/scraping/example_scraping_slashdot.php
Normal file
35
lib/sd/example/scraping/example_scraping_slashdot.php
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
<?php
|
||||
include_once('../../simple_html_dom.php');
|
||||
|
||||
/**
 * Scrape the article blocks from the slashdot.org front page.
 *
 * Each element whose id starts with "firehose-" becomes one entry with the
 * keys 'title' and 'body'. A part that cannot be located in a block is
 * reported as an empty string rather than dereferencing null.
 *
 * @return array List of associative arrays; empty when the page could not
 *               be loaded or contains no matching blocks.
 */
function scraping_slashdot() {
    // Start from an empty list so callers can always iterate the result.
    $ret = array();

    // create HTML DOM
    $html = file_get_html('http://slashdot.org/');
    if (!$html) {
        // Nothing was loaded, so there is nothing to scan or release.
        return $ret;
    }

    // get article block
    foreach ($html->find('div[id^=firehose-]') as $article) {
        $titleNode = $article->find('a.datitle', 0);
        $bodyNode  = $article->find('div.body', 0);

        // Build a fresh entry per article so a missing part never inherits
        // the value from the previous iteration.
        $ret[] = array(
            'title' => $titleNode ? trim($titleNode->plaintext) : '',
            'body'  => $bodyNode ? trim($bodyNode->plaintext) : '',
        );
    }

    // clean up memory
    $html->clear();
    unset($html);

    return $ret;
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// test it!
|
||||
$ret = scraping_slashdot();

// Print each article as a heading line followed by a one-item list.
foreach ($ret as $post) {
    echo $post['title'], '<br>';
    echo '<ul>', '<li>', $post['body'], '</li>', '</ul>';
}
|
||||
?>
|
||||
Loading…
Add table
Add a link
Reference in a new issue