init

2025-08-11 22:23:30 +02:00 · 2025-08-11 22:23:30 +02:00 · 72a26edcff
commit 72a26edcff
22092 changed files with 2101903 additions and 0 deletions
--- a/lib/sd/example/scraping/example_scraping_digg.php
+++ b/lib/sd/example/scraping/example_scraping_digg.php
@ -0,0 +1,44 @@
+<?php
+include_once('../../simple_html_dom.php');
+
+/**
+ * Scrape the story summaries from the digg.com front page.
+ *
+ * Every entry of the returned list is an array with the keys 'title',
+ * 'details' and 'diggs'. A field whose element is not present in a story
+ * block is returned as an empty string.
+ *
+ * @return array List of story arrays; empty when the page could not be
+ *               loaded or has no 'div.news-summary' blocks.
+ */
+function scraping_digg() {
+    // always hand back an array so callers can foreach over the result
+    $ret = array();
+
+    // create HTML DOM
+    $html = file_get_html('http://digg.com/');
+    if (!$html) {
+        // nothing was loaded, so there is nothing to scrape or clean up
+        return $ret;
+    }
+
+    // selector used for each field of a story
+    $fields = array(
+        'title'   => 'h3',
+        'details' => 'p',
+        'diggs'   => 'li a strong',
+    );
+
+    // get news block
+    foreach($html->find('div.news-summary') as $article) {
+        // start from a fresh item so nothing carries over from the previous story
+        $item = array();
+        foreach($fields as $name => $selector) {
+            // guard against a story block that lacks this element
+            $node = $article->find($selector, 0);
+            $item[$name] = $node ? trim($node->plaintext) : '';
+        }
+
+        $ret[] = $item;
+    }
+
+    // clean up memory
+    $html->clear();
+    unset($html);
+
+    return $ret;
+}
+
+
+// -----------------------------------------------------------------------------
+// Demo driver: scrape the front page and print each story as a small list.
+
+// digg.com inspects the user_agent header, so set one before fetching
+ini_set('user_agent', 'My-Application/2.5');
+
+$ret = scraping_digg();
+
+foreach($ret as $story) {
+    // title line first, then details and digg count as list items
+    echo $story['title'], '<br>';
+    echo '<ul>',
+         '<li>', $story['details'], '</li>',
+         '<li>Diggs: ', $story['diggs'], '</li>',
+         '</ul>';
+}
+
+?>
--- a/lib/sd/example/scraping/example_scraping_general.php
+++ b/lib/sd/example/scraping/example_scraping_general.php
@ -0,0 +1,59 @@
+<?php
+include_once('simple_html_dom.php');
+
+/**
+ * Load a page and dump every element that matches a selector.
+ *
+ * Progress messages and the dump of each matching element are echoed
+ * as HTML while the function runs.
+ *
+ * @param string $url    Address of the page to load.
+ * @param string $search Selector handed to the DOM's find().
+ * @return bool True when at least one element matched; false when nothing
+ *              matched or the page could not be loaded.
+ */
+function scraping_generic($url, $search) {
+	// Didn't find it yet.
+	$return = false;
+
+	// Both values are echoed into HTML below, so escape them once up front.
+	$safe_url = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
+	$safe_search = htmlspecialchars($search, ENT_QUOTES, 'UTF-8');
+
+	echo "reading the url: " . $safe_url . "<br/>";
+	// create HTML DOM
+	$html = file_get_html($url);
+	if (!$html) {
+		// nothing was loaded, so report "not found" instead of calling find() on a non-object
+		return $return;
+	}
+	echo "url has been read.<br/>";
+
+	// get article block
+	foreach($html->find($search) as $found) {
+		// Found at least one.
+		$return = true;
+		echo "found a: " . $safe_search . "<br/><pre>";
+		$found->dump();
+		echo "</pre><br/>";
+	}
+
+	// clean up memory
+	$html->clear();
+	unset($html);
+
+	return $return;
+}
+
+
+// ------------------------------------------
+error_log ("post:" . print_r($_POST, true));
+
+// Read the submitted fields; anything missing or not a plain string becomes "".
+$url = (isset($_POST['url']) && is_string($_POST['url'])) ? $_POST['url'] : "";
+$search = (isset($_POST['search']) && is_string($_POST['search'])) ? $_POST['search'] : "";
+
+// The values are user input: escape them before placing them in HTML so a
+// quote or tag in the input cannot break out of the attribute or the page.
+$url_html = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
+$search_html = htmlspecialchars($search, ENT_QUOTES, 'UTF-8');
+?>
+<form method="post">
+	URL: <input name="url" type="text" value="<?=$url_html;?>"/><br/>
+	Search: <input name="search" type="text" value="<?=$search_html;?>"/>
+	<input name="submit" type="submit" value="Submit"/>
+</form>
+<?php
+// -----------------------------------------------------------------------------
+// test it!
+if (isset ($_POST['submit']))
+{
+	// use the validated copies rather than reading $_POST again
+	$response = scraping_generic($url, $search);
+	if (!$response)
+	{
+		echo "Did not find any: " . $search_html . "<br />";
+	}
+}
+?>
--- a/lib/sd/example/scraping/example_scraping_imdb.php
+++ b/lib/sd/example/scraping/example_scraping_imdb.php
@ -0,0 +1,51 @@
+<?php
+include_once('../../simple_html_dom.php');
+
+/**
+ * Scrape the title, rating and overview fields from an IMDB title page.
+ *
+ * The result always has the keys 'Title' and 'Rating' once the page has
+ * been loaded, followed by one entry per 'div[class="info"]' block keyed
+ * by the text of that block's h5 heading. Collection stops at the block
+ * whose heading is "User Comments:".
+ *
+ * @param string $url Address of the IMDB title page.
+ * @return array Map of field label => text; empty when the page could not
+ *               be loaded.
+ */
+function scraping_IMDB($url) {
+    $ret = array();
+
+    // create HTML DOM
+    $html = file_get_html($url);
+    if (!$html) {
+        // nothing was loaded, so there is nothing to scrape or clean up
+        return $ret;
+    }
+
+    // get title
+    $title = $html->find('title', 0);
+    $ret['Title'] = $title ? $title->innertext : '';
+
+    // get rating
+    $rating = $html->find('div[class="general rating"] b', 0);
+    $ret['Rating'] = $rating ? $rating->innertext : '';
+
+    // get overview
+    foreach($html->find('div[class="info"]') as $div) {
+        $heading = $div->find('h5', 0);
+
+        // stop at user comments; break (not return) so the cleanup below still runs
+        if ($heading && $heading->innertext=='User Comments:')
+            break;
+
+        $key = '';
+        $val = '';
+
+        foreach($div->find('*') as $node) {
+            if ($node->tag=='h5')
+                $key = $node->plaintext;
+
+            // collect link text (except the "more" link) and plain text nodes
+            $is_link = ($node->tag=='a' && $node->plaintext!='more');
+            if ($is_link || $node->tag=='text')
+                $val .= trim(str_replace("\n", '', $node->plaintext));
+        }
+
+        $ret[$key] = $val;
+    }
+
+    // clean up memory
+    $html->clear();
+    unset($html);
+
+    return $ret;
+}
+
+
+// -----------------------------------------------------------------------------
+// Demo driver: scrape one title page and print every field on its own line.
+$ret = scraping_IMDB('http://imdb.com/title/tt0335266/');
+
+foreach($ret as $label => $text) {
+    // label in bold, followed by its value
+    echo '<strong>', $label, ' </strong>', $text, '<br>';
+}
+?>
--- a/lib/sd/example/scraping/example_scraping_slashdot.php
+++ b/lib/sd/example/scraping/example_scraping_slashdot.php
@ -0,0 +1,35 @@
+<?php
+include_once('../../simple_html_dom.php');
+
+/**
+ * Scrape the article titles and bodies from the slashdot.org front page.
+ *
+ * Every entry of the returned list is an array with the keys 'title' and
+ * 'body'. A field whose element is not present in an article block is
+ * returned as an empty string.
+ *
+ * @return array List of article arrays; empty when the page could not be
+ *               loaded or has no 'div[id^=firehose-]' blocks.
+ */
+function scraping_slashdot() {
+    // always hand back an array so callers can foreach over the result
+    $ret = array();
+
+    // create HTML DOM
+    $html = file_get_html('http://slashdot.org/');
+    if (!$html) {
+        // nothing was loaded, so there is nothing to scrape or clean up
+        return $ret;
+    }
+
+    // get article block
+    foreach($html->find('div[id^=firehose-]') as $article) {
+        // look the nodes up first so a missing element can be handled
+        $title = $article->find('a.datitle', 0);
+        $body = $article->find('div.body', 0);
+
+        // build a fresh item each time so nothing carries over between articles
+        $ret[] = array(
+            'title' => $title ? trim($title->plaintext) : '',
+            'body'  => $body ? trim($body->plaintext) : '',
+        );
+    }
+
+    // clean up memory
+    $html->clear();
+    unset($html);
+
+    return $ret;
+}
+
+// -----------------------------------------------------------------------------
+// Demo driver: scrape the front page and print each article as a small list.
+$ret = scraping_slashdot();
+
+foreach($ret as $article) {
+    // title line first, then the body as a single list item
+    echo $article['title'], '<br>';
+    echo '<ul>',
+         '<li>', $article['body'], '</li>',
+         '</ul>';
+}
+?>