This commit is contained in:
steven 2025-08-11 22:23:30 +02:00
commit 72a26edcff
22092 changed files with 2101903 additions and 0 deletions

View file

@ -0,0 +1,43 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Covers only the extraction of `<base href="...">`.
 *
 * General URL handling is tested separately; see `UrlTest.php`.
 */
class BaseHrefTest extends \PHPUnit\Framework\TestCase
{
    /**
     * The `missing.html` fixture is expected to yield no base href at all.
     *
     * @test
     */
    public function testMissingBaseHref()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');

        // An absent base href is reported as null.
        self::assertNull($scraper->baseHref);
    }

    /**
     * A declared base href is returned exactly as written in the markup.
     *
     * @test
     */
    public function testBaseHref()
    {
        // The fixture contains: <base href="https://test-pages-with-base-href.phpscraper.de/">
        $fixtureUrl = 'https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html';
        $expectedBaseHref = 'https://test-pages-with-base-href.phpscraper.de/';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go($fixtureUrl);

        self::assertSame($expectedBaseHref, $scraper->baseHref);
    }
}

View file

@ -0,0 +1,38 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Verifies extraction of the canonical URL (`<link rel="canonical" ...>`).
 */
class CanonicalTest extends \PHPUnit\Framework\TestCase
{
    /**
     * When no canonical link is set, the property is expected to be null.
     *
     * @test
     */
    public function testMissingCanonical()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');

        self::assertNull($scraper->canonical);
    }

    /**
     * The canonical URL may differ from the URL that was requested.
     *
     * @test
     */
    public function testWithCanonical()
    {
        // Fixture markup: <link rel="canonical" href="https://test-pages.phpscraper.de/navigation/2.html" />
        $requestedUrl = 'https://test-pages.phpscraper.de/navigation/1.html';
        $expectedCanonical = 'https://test-pages.phpscraper.de/navigation/2.html';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go($requestedUrl);

        self::assertSame($expectedCanonical, $scraper->canonical);
    }
}

93
lib/sc/tests/CoreTest.php Normal file
View file

@ -0,0 +1,93 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Basic workflow checks: property vs. method access, navigating between
 * pages, and chaining calls on a fresh scraper instance.
 */
class CoreTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Reading `title` as a property and calling `title()` must agree.
     *
     * @test
     */
    public function testMethodAndPropertyCallsAreEqual()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://phpscraper.de');

        self::assertSame($scraper->title, $scraper->title());
    }

    /**
     * Navigating a second time must replace the state of the first page.
     *
     * @test
     */
    public function testChangeOfCurrentPage()
    {
        $firstUrl = 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html';
        $firstTitle = 'Lorem Ipsum';
        $secondUrl = 'https://phpscraper.de';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();

        // Step 1: open the first page and confirm both URL and title are exposed.
        $scraper->go($firstUrl);
        self::assertSame($firstUrl, $scraper->currentUrl);
        self::assertSame($firstTitle, $scraper->title);

        // Step 2: move on to another page.
        $scraper->go($secondUrl);

        // The current URL reflects the new location ...
        self::assertSame($secondUrl, $scraper->currentUrl);

        // ... and nothing from the first page is reported any more.
        self::assertNotSame($firstUrl, $scraper->currentUrl);
        self::assertNotSame($firstTitle, $scraper->title);
    }

    /**
     * Calls should be chainable and easy to access.
     *
     * @test
     */
    public function testBasicChainability()
    {
        // The first h1 of this fixture reads: "We are testing here & elsewhere!"
        $url = 'https://test-pages.phpscraper.de/meta/html-entities.html';
        $expectedHeading = 'We are testing here & elsewhere!';

        // Variant 1: separate statements for construction, navigation and access.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go($url);
        self::assertSame($expectedHeading, $scraper->h1[0]);

        // Variant 2: everything in a single chained expression.
        self::assertSame(
            $expectedHeading,
            (new \Spekulatius\PHPScraper\PHPScraper())->go($url)->h1[0]
        );
    }
}

View file

@ -0,0 +1,104 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the custom XPath selector helpers (`filterFirst`, `filterFirstText`
 * and `filterTexts`) against the selectors fixture page.
 */
class CustomSelectorTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Opens the selectors fixture and verifies that the expected page was loaded.
     *
     * @return \Spekulatius\PHPScraper\PHPScraper Scraper positioned on the fixture page.
     */
    private function loadSelectorPage(): \Spekulatius\PHPScraper\PHPScraper
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;
        $web->go('https://test-pages.phpscraper.de/content/selectors.html');

        // Ensure we got the test page.
        $this->assertSame(
            'Selector Tests',
            $web->title
        );

        return $web;
    }

    /**
     * An invalid XPath expression must surface as an exception.
     *
     * @test
     */
    public function testFailedSelectionBasedOnId()
    {
        $web = $this->loadSelectorPage();

        // `//[@id='by-id']` lacks a node test and is therefore not valid XPath.
        // NOTE(review): the exception presumably stems from a PHP warning that the
        // test runner converts into an exception - confirm for the PHPUnit version in use.
        $thrown = null;
        try {
            $web->filterFirstText("//[@id='by-id']");
        } catch (\Exception $e) {
            $thrown = $e;
        }

        // Asserting outside the catch block ensures the test cannot pass
        // silently when no exception is raised at all.
        $this->assertNotNull(
            $thrown,
            'Expected an exception for the invalid XPath expression, but none was thrown.'
        );
        $this->assertSame(
            'DOMXPath::query(): Invalid expression',
            $thrown->getMessage()
        );
    }

    /**
     * @test
     */
    public function testSelectionBasedOnId()
    {
        $web = $this->loadSelectorPage();

        // `filterFirstText` returns the text of the first match directly.
        $this->assertSame(
            'Content by ID',
            $web->filterFirstText("//*[@id='by-id']")
        );
    }

    /**
     * @test
     */
    public function testSelectionBasedOnTag()
    {
        $web = $this->loadSelectorPage();

        // Select a single node with `filterFirst` and chain `->text()`.
        $this->assertSame(
            'Selector Tests (h1)',
            $web->filterFirst('//h1')->text()
        );

        // Select all matches as an array of strings using `filterTexts`.
        $this->assertSame(
            ['Selector Tests (h1)'],
            $web->filterTexts('//h1')
        );
    }

    /**
     * @test
     */
    public function testSelectionBasedOnClass()
    {
        $web = $this->loadSelectorPage();

        // `filterTexts` returns the text of every match, so no `->text()` is needed.
        $this->assertSame(
            ['Content by Class 1', 'Content by Class 2'],
            $web->filterTexts("//*[@class='by-class']")
        );
    }
}

View file

@ -0,0 +1,71 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests `fetchAsset()`: failing downloads, successful downloads, and
 * absolute vs. relative asset URLs.
 */
class DownloadTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Requesting a non-existent URL must raise the HTTP client's exception.
     *
     * @test
     */
    public function testMissingDownload()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Expectations have to be registered before the failing call.
        $this->expectException(\Symfony\Component\HttpClient\Exception\ClientException::class);
        $this->expectExceptionMessage('HTTP/2 404 returned for "https://phpscraper.de/broken-url"');

        $web->fetchAsset('https://phpscraper.de/broken-url');
    }

    /**
     * Downloads the PHPScraper sitemap and ensures the homepage is listed,
     * i.e. the download succeeded and produced usable output.
     *
     * @test
     */
    public function testDownload()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;
        $xmlString = $web->fetchAsset('https://phpscraper.de/sitemap.xml');

        // Convert XML to array
        // Credit: https://stackoverflow.com/a/20431742
        $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA);

        // simplexml_load_string() returns false on malformed input; fail with a
        // clear message instead of a confusing error further down.
        $this->assertNotFalse($xml, 'The downloaded sitemap could not be parsed as XML.');

        $array = json_decode((string) json_encode($xml), true);
        $this->assertIsArray($array);
        $this->assertArrayHasKey('url', $array);

        $urls = array_map(
            fn ($url) => $url['loc'],
            $array['url']
        );

        $this->assertContains(
            'https://phpscraper.de/',
            $urls
        );
    }

    /**
     * We should support both absolute and relative URLs.
     *
     * Here we use the sitemap test page as a reference.
     *
     * @test
     */
    public function testDifferentUrlTypes()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Test 1: Absolute URL
        $this->assertSame(
            $web->fetchAsset($web->sitemapUrl),
            $web->fetchAsset($web->currentBaseHost . '/custom_sitemap.xml'),
        );

        // Test 2: Relative URL
        $this->assertSame(
            $web->fetchAsset($web->sitemapUrl),
            $web->fetchAsset('/custom_sitemap.xml'),
        );
    }
}

View file

@ -0,0 +1,172 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
use Spekulatius\PHPScraper\DataTransferObjects\FeedEntry;
/**
 * Tests RSS feed discovery (`rssUrls`), raw feed parsing (`rssRaw`) and the
 * DTO-based access (`rss`).
 */
class FeedRssTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingRssUrls()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/missing.html');

        // This page shouldn't contain any RSS feeds.
        $this->assertEmpty($web->rssUrls);
    }

    /**
     * @test
     */
    public function testRssUrls()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Did we get the expected result? Any URLs should be made absolute.
        $this->assertSame([
            'https://test-pages.phpscraper.de/absolute.xml',
            'https://test-pages.phpscraper.de/relative.xml',
        ], $web->rssUrls);
    }

    /**
     * Tests if we can use a custom URL instead of an identified one.
     *
     * @test
     */
    public function testCustomRssUrl()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // We should always allow to use a custom url.
        // Both files are the same.
        // One URL isn't linked from the feeds.html and therefore is custom.
        $this->assertSame(
            $web->rssRaw('https://test-pages.phpscraper.de/custom_rss.xml'),
            $web->rssRaw('https://test-pages.phpscraper.de/relative.xml')
        );
    }

    /**
     * We should support both absolute and relative URLs.
     *
     * @test
     */
    public function testDifferentRssUrlTypes()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Test 1: Absolute URL
        $this->assertSame(
            $web->rssRaw($web->rssUrls[0]),
            $web->rssRaw($web->currentBaseHost . '/custom_rss.xml'),
        );

        // Test 2: Relative URL
        $this->assertSame(
            $web->rssRaw($web->rssUrls[0]),
            $web->rssRaw('/custom_rss.xml'),
        );
    }

    /**
     * Tests the raw parsing.
     *
     * @test
     */
    public function testRssRawContent()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // The raw RSS is rather unhandy to work with. Let's put it in a var before testing stuff.
        $rssRaw = $web->rssRaw('https://test-pages.phpscraper.de/custom_rss.xml')[0]['entry'];

        // Ensure the structure is a nested array.
        $this->assertIsArray($rssRaw);
        $this->assertIsArray($rssRaw[4]);

        // Check some entries to ensure the parsing works (expected value first).
        $this->assertSame(
            'https://peterthaleikis.com/posts/how-i-built-my-first-browser-extension/',
            $rssRaw[4]['link']['@attributes']['href']
        );
        $this->assertSame(
            'https://peterthaleikis.com/posts/how-to-use-pug-on-netlify/',
            $rssRaw[2]['link']['@attributes']['href']
        );
        $this->assertSame(
            'https://peterthaleikis.com/posts/startup-name-check:-experiences-of-the-first-week/',
            $rssRaw[0]['link']['@attributes']['href']
        );
    }

    /**
     * Tests the DTO creation.
     *
     * @test
     */
    public function testRss()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // The raw RSS is rather unhandy to work with (hence we actually use the DTOs).
        $rss = $web->rss('https://test-pages.phpscraper.de/custom_rss.xml');

        // Check the count
        $this->assertCount(37, $rss);

        // Check some entries to ensure the parsing works.
        // Set 1
        $this->assertInstanceOf(FeedEntry::class, $rss[4]);
        $this->assertSame(
            'How I Built My First Browser Extension',
            $rss[4]->title
        );
        $this->assertSame(
            'https://peterthaleikis.com/posts/how-i-built-my-first-browser-extension/',
            $rss[4]->link
        );

        // Set 2
        $this->assertInstanceOf(FeedEntry::class, $rss[2]);
        $this->assertSame(
            'How to Use Pug on Netlify?',
            $rss[2]->title
        );
        $this->assertSame(
            'https://peterthaleikis.com/posts/how-to-use-pug-on-netlify/',
            $rss[2]->link
        );

        // Set 3
        $this->assertInstanceOf(FeedEntry::class, $rss[0]);
        $this->assertSame(
            'Startup Name Check: Experiences of the First week',
            $rss[0]->title
        );
        $this->assertSame(
            'https://peterthaleikis.com/posts/startup-name-check:-experiences-of-the-first-week/',
            $rss[0]->link
        );
    }
}

View file

@ -0,0 +1,192 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
use Spekulatius\PHPScraper\DataTransferObjects\FeedEntry;
/**
 * Tests the static search index support: URL guessing (`searchIndexUrl`),
 * raw parsing (`searchIndexRaw`) and DTO creation (`searchIndex`).
 */
class FeedSearchIndexTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testSearchIndexUrl()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Did we get the expected `/index.json`?
        $this->assertSame(
            'https://test-pages.phpscraper.de/index.json',
            $web->searchIndexUrl
        );
    }

    /**
     * Tests if the default search index path is applied.
     *
     * @test
     */
    public function testDefaultSearchIndexUrl()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // `searchIndexUrl` should be the default.
        $this->assertSame(
            $web->searchIndexRaw(),
            $web->searchIndexRaw($web->searchIndexUrl),
        );
    }

    /**
     * The `custom_index.json` and `index.json` are the same.
     *
     * So we compare the two results to ensure the custom URL feature works.
     *
     * @test
     */
    public function testCustomSearchIndexUrl()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // We should always allow for custom urls.
        $this->assertSame(
            $web->searchIndexRaw($web->searchIndexUrl),
            $web->searchIndexRaw($web->currentBaseHost . '/custom_index.json'),
        );
    }

    /**
     * We should support both absolute and relative URLs.
     *
     * @test
     */
    public function testDifferentSearchIndexUrlTypes()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Test 1: Absolute URL
        $this->assertSame(
            $web->searchIndexRaw($web->searchIndexUrl),
            $web->searchIndexRaw($web->currentBaseHost . '/custom_index.json'),
        );

        // Test 2: Relative URL
        $this->assertSame(
            $web->searchIndexRaw($web->searchIndexUrl),
            $web->searchIndexRaw('/custom_index.json'),
        );
    }

    /**
     * Tests the raw parsing.
     *
     * @test
     */
    public function testSearchIndexRaw()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Get the raw searchIndex and store it.
        $searchIndexRaw = $web->searchIndexRaw;

        // Did we get the expected `/index.json`? It should contain 60 entries.
        // Checked before any index access so a short result fails with a clear message.
        $this->assertIsArray($searchIndexRaw);
        $this->assertCount(60, $searchIndexRaw);

        // Ensure the structure is a nested array.
        $this->assertIsArray($searchIndexRaw[42]);

        // Check some data to ensure the parsing actually worked.
        $this->assertSame(
            'https://pastablelists.com/en/counties-of-croatia',
            $searchIndexRaw[4]['link']
        );
        $this->assertSame(
            'https://pastablelists.com/en/municipalities-of-macedonia',
            $searchIndexRaw[2]['link']
        );
        $this->assertSame(
            'https://pastablelists.com/en/counties-and-municipalities-of-lithuania',
            $searchIndexRaw[0]['link']
        );
    }

    /**
     * Tests the DTO creation.
     *
     * @test
     */
    public function testSearchIndex()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Get the searchIndex and store it.
        $searchIndex = $web->searchIndex;

        // Did we get the expected `/index.json`? It should contain 60 entries.
        $this->assertCount(60, $searchIndex);

        // Check some data to ensure the parsing actually worked:
        // Set 1
        $this->assertInstanceOf(FeedEntry::class, $searchIndex[4]);
        $this->assertSame(
            'List of the Counties of Croatia',
            $searchIndex[4]->title,
        );
        $this->assertSame(
            'List of the Counties of Croatia ready for copy and paste or export.',
            $searchIndex[4]->description,
        );
        $this->assertSame(
            'https://pastablelists.com/en/counties-of-croatia',
            $searchIndex[4]->link,
        );

        // Set 2
        $this->assertInstanceOf(FeedEntry::class, $searchIndex[2]);
        $this->assertSame(
            'List of the Municipalities of Macedonia',
            $searchIndex[2]->title,
        );
        $this->assertSame(
            'List of the Municipalities of Macedonia ready for copy and paste or export.',
            $searchIndex[2]->description,
        );
        $this->assertSame(
            'https://pastablelists.com/en/municipalities-of-macedonia',
            $searchIndex[2]->link,
        );

        // Set 3
        $this->assertInstanceOf(FeedEntry::class, $searchIndex[0]);
        $this->assertSame(
            'List of the Counties and Municipalities of Lithuania',
            $searchIndex[0]->title,
        );
        $this->assertSame(
            'List of the Counties and Municipalities of Lithuania, ready for copy and paste or export.',
            $searchIndex[0]->description,
        );
        $this->assertSame(
            'https://pastablelists.com/en/counties-and-municipalities-of-lithuania',
            $searchIndex[0]->link,
        );
    }
}

View file

@ -0,0 +1,147 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
use Spekulatius\PHPScraper\DataTransferObjects\FeedEntry;
/**
 * Tests the sitemap support: URL guessing (`sitemapUrl`), raw XML parsing
 * (`sitemapRaw`) and DTO creation (`sitemap`).
 */
class FeedSitemapTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testSitemapUrl()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is guessed, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Did we get the expected `/sitemap.xml`?
        $this->assertSame(
            'https://test-pages.phpscraper.de/sitemap.xml',
            $web->sitemapUrl
        );
    }

    /**
     * Tests if the default sitemap path is applied.
     *
     * @test
     */
    public function testDefaultSitemapUrl()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is guessed, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // The sitemapUrl should be the default.
        $this->assertSame(
            $web->sitemapRaw(),
            $web->sitemapRaw($web->sitemapUrl),
        );
    }

    /**
     * The files `sitemap.xml` and `custom_sitemap.xml` are the same and used to ensure the custom URL feature works.
     *
     * @test
     */
    public function testCustomSitemapUrl()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is guessed, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // We should always allow for custom paths.
        $this->assertSame(
            $web->sitemapRaw($web->sitemapUrl),
            $web->sitemapRaw($web->currentBaseHost . '/custom_sitemap.xml'),
        );
    }

    /**
     * We should support both absolute and relative URLs.
     *
     * @test
     */
    public function testDifferentSitemapUrlTypes()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is predefined, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Test 1: Absolute URL
        $this->assertSame(
            $web->sitemapRaw($web->sitemapUrl),
            $web->sitemapRaw($web->currentBaseHost . '/custom_sitemap.xml'),
        );

        // Test 2: Relative URL
        $this->assertSame(
            $web->sitemapRaw($web->sitemapUrl),
            $web->sitemapRaw('/custom_sitemap.xml'),
        );
    }

    /**
     * Ensure we can parse the sitemap in itself (XML).
     *
     * @test
     */
    public function testSitemapRaw()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is guessed, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Get the sitemap and store it.
        $sitemapRaw = $web->sitemapRaw;

        // Check the count
        $this->assertCount(129, $sitemapRaw['url']);

        // Check some entries to ensure the parsing works as expected.
        $this->assertSame(
            'https://phpscraper.de/apis/linkedin.html',
            $sitemapRaw['url'][4]['loc'],
        );
        $this->assertSame(
            'https://phpscraper.de/de/apis/zalando.html',
            $sitemapRaw['url'][20]['loc'],
        );
    }

    /**
     * Tests the DTO creation.
     *
     * @test
     */
    public function testSitemap()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. As the URL is guessed, it's only about the base URL.
        $web->go('https://test-pages.phpscraper.de/meta/feeds.html');

        // Get the sitemap and store it.
        $sitemap = $web->sitemap;

        // Check the count
        $this->assertCount(129, $sitemap);

        // Check some samples.
        $this->assertInstanceOf(FeedEntry::class, $sitemap[42]);
        $this->assertSame(
            'https://phpscraper.de/apis/linkedin.html',
            $sitemap[4]->link,
        );
        $this->assertSame(
            'https://phpscraper.de/de/apis/zalando.html',
            $sitemap[20]->link,
        );
    }
}

View file

@ -0,0 +1,130 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Verifies heading extraction (`h1` to `h6` and the combined `headings`)
 * across fixtures with HTML entities, umlauts and Chinese characters.
 */
class HeadingTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Heading levels that are absent from the page come back as empty arrays.
     *
     * @test
     */
    public function testMissingHeadings()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/no-meta.html');

        // h1 is deliberately left out: it does exist on this fixture.
        foreach (['h2', 'h3', 'h4', 'h5', 'h6'] as $level) {
            self::assertSame([], $scraper->{$level});
        }
    }

    /**
     * HTML entities in headings are expected to be decoded.
     *
     * @test
     */
    public function testWithHTMLEntity()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/html-entities.html');

        // First-level heading
        self::assertSame('We are testing here & elsewhere!', $scraper->h1[0]);

        // Second-level headings: count first, then content.
        self::assertSame(2, count($scraper->h2));
        self::assertSame(['Cat & Mouse', 'Mouse & Cat'], $scraper->h2);

        // All six levels at once, ordered from h1 to h6.
        $expectedByLevel = [
            ['We are testing here & elsewhere!'],
            ['Cat & Mouse', 'Mouse & Cat'],
            ['1', '2', '3'],
            ['Not so important heading'],
            [],
            [],
        ];
        self::assertSame($expectedByLevel, $scraper->headings);
    }

    /**
     * @test
     */
    public function testLoremIpsum()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        // First-level heading
        self::assertSame('We are testing here!', $scraper->h1[0]);

        // Second-level headings: identical texts are both kept.
        self::assertSame(2, count($scraper->h2));
        self::assertSame(
            ['h2s are headings too.', 'h2s are headings too.'],
            $scraper->h2
        );
    }

    /**
     * @test
     */
    public function testGermanUmlaute()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');

        // First-level heading
        self::assertSame('We are testing here ä ü ö!', $scraper->h1[0]);

        // Second-level headings
        self::assertSame(2, count($scraper->h2));
        self::assertSame(
            ['Täst, ehm, test!', 'Weiter testen, Müller!'],
            $scraper->h2
        );
    }

    /**
     * @test
     */
    public function testChineseCharacters()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');

        // First-level heading
        self::assertSame('We are testing here! 加油!', $scraper->h1[0]);

        // Second-level headings
        self::assertSame(2, count($scraper->h2));
        self::assertSame(['加油!', '加油 #1!'], $scraper->h2);
    }
}

243
lib/sc/tests/ImageTest.php Normal file
View file

@ -0,0 +1,243 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Verifies image extraction: the plain URL list (`images`) and the detailed
 * variant (`imagesWithDetails`) carrying url, alt, width and height.
 */
class ImageTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Builds one expected `imagesWithDetails` entry.
     *
     * The key order (url, alt, width, height) matters because the entries are
     * compared with `assertSame`.
     *
     * @param string      $url    Expected absolute image URL.
     * @param string      $alt    Expected alt text.
     * @param string|null $width  Expected width attribute, null when absent.
     * @param string|null $height Expected height attribute, null when absent.
     *
     * @return array
     */
    private function imageEntry(string $url, string $alt, ?string $width = null, ?string $height = null): array
    {
        return [
            'url' => $url,
            'alt' => $alt,
            'width' => $width,
            'height' => $height,
        ];
    }

    /**
     * @test
     */
    public function testNoImages()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Without images both accessors return an empty array.
        self::assertSame([], $scraper->images);
        self::assertSame([], $scraper->imagesWithDetails);
    }

    /**
     * @test
     */
    public function testLoremIpsum()
    {
        $catUrl = 'https://test-pages.phpscraper.de/assets/cat.jpg';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        // The fixture references cat.jpg twice.
        self::assertSame(2, count($scraper->images));

        // Plain URL list
        self::assertSame([$catUrl, $catUrl], $scraper->images);

        // Detailed list
        self::assertSame([
            $this->imageEntry($catUrl, 'absolute path'),
            $this->imageEntry($catUrl, 'relative path'),
        ], $scraper->imagesWithDetails);
    }

    /**
     * @test
     */
    public function testGermanUmlaute()
    {
        $catUrl = 'https://test-pages.phpscraper.de/assets/katze-ä-ü-ö.jpg';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');

        // Sanity check on the heading first.
        self::assertSame('We are testing here ä ü ö!', $scraper->h1[0]);

        // Number of images
        self::assertSame(2, count($scraper->images));

        // Plain URL list
        self::assertSame([$catUrl, $catUrl], $scraper->images);

        // Detailed list
        self::assertSame([
            $this->imageEntry($catUrl, 'absolute path'),
            $this->imageEntry($catUrl, 'relative path'),
        ], $scraper->imagesWithDetails);
    }

    /**
     * @test
     */
    public function testChineseCharacters()
    {
        $catUrl = 'https://test-pages.phpscraper.de/assets/貓.jpg';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');

        // Number of images
        self::assertSame(2, count($scraper->images));

        // Plain URL list
        self::assertSame([$catUrl, $catUrl], $scraper->images);

        // Detailed list
        self::assertSame([
            $this->imageEntry($catUrl, 'absolute path'),
            $this->imageEntry($catUrl, 'relative path'),
        ], $scraper->imagesWithDetails);
    }

    /**
     * Relative image paths are resolved against the page's base href.
     *
     * @test
     */
    public function testBaseHref()
    {
        $absoluteUrl = 'https://test-pages.phpscraper.de/assets/cat.jpg';
        $resolvedUrl = 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/images/base-href.html');

        // Number of images
        self::assertSame(2, count($scraper->images));

        // Plain URL list
        self::assertSame([$absoluteUrl, $resolvedUrl], $scraper->images);

        // Detailed list
        self::assertSame([
            $this->imageEntry($absoluteUrl, 'absolute path with base href'),
            $this->imageEntry($resolvedUrl, 'relative path with base href'),
        ], $scraper->imagesWithDetails);
    }

    /**
     * @test
     */
    public function testWidth()
    {
        $catUrl = 'https://test-pages.phpscraper.de/assets/cat.jpg';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/images/width.html');

        // Number of images
        self::assertSame(3, count($scraper->images));

        // The width is reported as written, including its unit.
        self::assertSame([
            $this->imageEntry($catUrl, 'no width'),
            $this->imageEntry($catUrl, 'width at 1200px', '1200px'),
            $this->imageEntry($catUrl, 'width at 100rem', '100rem'),
        ], $scraper->imagesWithDetails);
    }

    /**
     * @test
     */
    public function testHeight()
    {
        $catUrl = 'https://test-pages.phpscraper.de/assets/cat.jpg';

        $scraper = new \Spekulatius\PHPScraper\PHPScraper();
        $scraper->go('https://test-pages.phpscraper.de/images/height.html');

        // Number of images
        self::assertSame(3, count($scraper->images));

        // The height is reported as written, including its unit.
        self::assertSame([
            $this->imageEntry($catUrl, 'no height'),
            $this->imageEntry($catUrl, 'height at 1200px', null, '1200px'),
            $this->imageEntry($catUrl, 'height at 100rem', null, '100rem'),
        ], $scraper->imagesWithDetails);
    }
}

View file

@ -0,0 +1,91 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests keyword extraction from page content, with and without scores.
 */
class KeywordTest extends \PHPUnit\Framework\TestCase
{
    /**
     * A selection of expected keywords must be part of the extracted list.
     *
     * @test
     */
    public function testKeywordExtraction()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        // It contains 3 paragraphs from the English Wikipedia article for "lorem ipsum"
        $web->go('https://test-pages.phpscraper.de/content/keywords.html');

        // Check the keywords on this case...
        $keywords = $web->contentKeywords;

        // A selected list of keywords to expect
        $shouldKeywords = [
            '1960s',
            'added',
            'adopted lorem ipsum',
            'advertisements',
            'aldus employed',
            'corrupted version',
            'graphic',
            'improper latin',
            'introduced',
            'keyword extraction tests',
            'test',
            'microsoft word',
            'english wikipedia',
            'lorem ipsum',
            'lorem ipsum text',
        ];

        // Check if all are part of the output. `assertContains` reports the
        // haystack on failure, unlike a boolean `in_array()` check.
        foreach ($shouldKeywords as $keyword) {
            $this->assertContains(
                $keyword,
                $keywords,
                sprintf('"%s" is missing', $keyword)
            );
        }
    }

    /**
     * A selection of expected keywords must be present with the expected score.
     *
     * @test
     */
    public function testKeywordExtractionWithScores()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        // It contains 3 paragraphs from the English Wikipedia article for "lorem ipsum"
        $web->go('https://test-pages.phpscraper.de/content/keywords.html');

        // Check the keywords on this case...
        $keywords = $web->contentKeywordsWithScores;

        // A selected list of keywords to expect
        $shouldKeywords = [
            'added' => 1.0,
            'adopted lorem ipsum' => 11.0,
            'advertisements' => 1.0,
            'aldus employed' => 4.0,
            'corrupted version' => 4.0,
            'graphic' => 1.0,
            'improper latin' => 4.0,
            'introduced' => 1.0,
            'keyword extraction tests' => 9.0,
            'test' => 1.0,
            'microsoft word' => 5.3333333333333,
            'english wikipedia' => 4.0,
            'lorem ipsum' => 8.0,
            'lorem ipsum text' => 11.0,
        ];

        // Check if all are part of the output with the expected score
        foreach ($shouldKeywords as $keyword => $score) {
            // Fail clearly on a missing keyword instead of reading an undefined index.
            $this->assertArrayHasKey(
                $keyword,
                $keywords,
                sprintf('"%s" is missing', $keyword)
            );

            // Both sides are rounded to 8 decimals so the shortened literal
            // above (5.3333333333333) still matches; expected value comes first.
            $this->assertSame(
                round($score, 8),
                round($keywords[$keyword], 8),
                sprintf('Score for "%s" is incorrect', $keyword)
            );
        }
    }
}

406
lib/sc/tests/LinkTest.php Normal file
View file

@ -0,0 +1,406 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
class LinkTest extends \PHPUnit\Framework\TestCase
{
/**
* @test
*/
public function testNoLinks()
{
$web = new \Spekulatius\PHPScraper\PHPScraper;
// Navigate to the test page.
$web->go('https://test-pages.phpscraper.de/links/no-links.html');
// No links -> an empty array is expected.
$this->assertSame([], $web->links);
$this->assertSame([], $web->linksWithDetails);
}
/**
* @test
*/
public function testTarget()
{
$web = new \Spekulatius\PHPScraper\PHPScraper;
// Navigate to the test page.
$web->go('https://test-pages.phpscraper.de/links/target.html');
// Check the number of links
$this->assertSame(6, count($web->links));
// Check the simple links list
$this->assertSame([
'https://placekitten.com/408/287',
'https://placekitten.com/444/333',
'https://placekitten.com/444/321',
'https://placekitten.com/408/287',
'https://placekitten.com/444/333',
'https://placekitten.com/444/321',
], $web->links);
// Check the complex links list
$this->assertSame([
[
'url' => 'https://placekitten.com/408/287',
'protocol' => 'https',
'text' => 'external kitten',
'title' => null,
'target' => '_blank',
'rel' => null,
'image' => [],
'isNofollow' => false,
'isUGC' => false,
'isSponsored' => false,
'isMe' => false,
'isNoopener' => false,
'isNoreferrer' => false,
], [
'url' => 'https://placekitten.com/444/333',
'protocol' => 'https',
'text' => 'external kitten',
'title' => null,
'target' => '_blank',
'rel' => null,
'image' => [],
'isNofollow' => false,
'isUGC' => false,
'isSponsored' => false,
'isMe' => false,
'isNoopener' => false,
'isNoreferrer' => false,
], [
'url' => 'https://placekitten.com/444/321',
'protocol' => 'https',
'text' => 'external kitten',
'title' => null,
'target' => '_blank',
'rel' => null,
'image' => [],
'isNofollow' => false,
'isUGC' => false,
'isSponsored' => false,
'isMe' => false,
'isNoopener' => false,
'isNoreferrer' => false,
], [
'url' => 'https://placekitten.com/408/287',
'protocol' => 'https',
'text' => 'external kitten',
'title' => null,
'target' => 'kitten',
'rel' => null,
'image' => [],
'isNofollow' => false,
'isUGC' => false,
'isSponsored' => false,
'isMe' => false,
'isNoopener' => false,
'isNoreferrer' => false,
], [
'url' => 'https://placekitten.com/444/333',
'protocol' => 'https',
'text' => 'external kitten',
'title' => null,
'target' => 'kitten',
'rel' => null,
'image' => [],
'isNofollow' => false,
'isUGC' => false,
'isSponsored' => false,
'isMe' => false,
'isNoopener' => false,
'isNoreferrer' => false,
], [
'url' => 'https://placekitten.com/444/321',
'protocol' => 'https',
'text' => 'external kitten',
'title' => null,
'target' => 'kitten',
'rel' => null,
'image' => [],
'isNofollow' => false,
'isUGC' => false,
'isSponsored' => false,
'isMe' => false,
'isNoopener' => false,
'isNoreferrer' => false,
],
], $web->linksWithDetails);
}
/**
* @test
*/
public function testRel()
{
    // Verifies that links carrying different `rel` attributes are extracted and
    // that `linksWithDetails` exposes both the raw `rel` value and the boolean
    // flags (`isNofollow`, `isUGC`, ...) expected for each link.
    $web = new \Spekulatius\PHPScraper\PHPScraper;

    // Navigate to the test page.
    // This page contains several links with different rel attributes.
    $web->go('https://test-pages.phpscraper.de/links/rel.html');

    // Check the number of links. `assertCount` takes the expected size first
    // and the collection second.
    $this->assertCount(5, $web->links);

    // Check the simple links list
    $this->assertSame([
        'https://placekitten.com/432/287',
        'https://placekitten.com/456/287',
        'https://placekitten.com/345/287',
        'https://placekitten.com/345/287',
        'https://placekitten.com/345/222',
    ], $web->links);

    // Check the complex links list
    $this->assertSame([
        [
            'url' => 'https://placekitten.com/432/287',
            'protocol' => 'https',
            'text' => 'external kitten',
            'title' => null,
            'target' => null,
            'rel' => 'nofollow',
            'image' => [],
            'isNofollow' => true,
            'isUGC' => false,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ], [
            'url' => 'https://placekitten.com/456/287',
            'protocol' => 'https',
            'text' => 'external kitten',
            'title' => null,
            'target' => null,
            'rel' => 'ugc',
            'image' => [],
            'isNofollow' => false,
            'isUGC' => true,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ], [
            // Two space-separated rel values set both matching flags.
            'url' => 'https://placekitten.com/345/287',
            'protocol' => 'https',
            'text' => 'external kitten',
            'title' => null,
            'target' => null,
            'rel' => 'nofollow ugc',
            'image' => [],
            'isNofollow' => true,
            'isUGC' => true,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ], [
            'url' => 'https://placekitten.com/345/287',
            'protocol' => 'https',
            'text' => 'external kitten',
            'title' => null,
            'target' => null,
            'rel' => 'noopener',
            'image' => [],
            'isNofollow' => false,
            'isUGC' => false,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => true,
            'isNoreferrer' => false,
        ], [
            'url' => 'https://placekitten.com/345/222',
            'protocol' => 'https',
            'text' => 'external kitten',
            'title' => null,
            'target' => null,
            'rel' => 'noreferrer',
            'image' => [],
            'isNofollow' => false,
            'isUGC' => false,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => true,
        ],
    ], $web->linksWithDetails);
}
/**
* @test
*/
public function testBaseHref()
{
    // Verifies link extraction on a page served from test-pages.phpscraper.de
    // whose relative link is expected to resolve against the host
    // test-pages-with-base-href.phpscraper.de (see the third expected URL).
    $web = new \Spekulatius\PHPScraper\PHPScraper;

    // Navigate to the test page.
    $web->go('https://test-pages.phpscraper.de/links/base-href.html');

    // Check the number of links. `assertCount` takes the expected size first
    // and the collection second.
    $this->assertCount(3, $web->links);

    // Check the simple links list
    $this->assertSame([
        'https://placekitten.com/408/287',
        'https://test-pages.phpscraper.de/assets/cat.jpg',
        'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg',
    ], $web->links);

    // Check the complex links list
    $this->assertSame([
        [
            'url' => 'https://placekitten.com/408/287',
            'protocol' => 'https',
            'text' => 'external kitten',
            'title' => 'external path with base href',
            'target' => null,
            'rel' => null,
            'image' => [],
            'isNofollow' => false,
            'isUGC' => false,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ], [
            'url' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
            'protocol' => 'https',
            'text' => 'absolute path to cat',
            'title' => 'absolute internal path with base href',
            'target' => null,
            'rel' => null,
            'image' => [],
            'isNofollow' => false,
            'isUGC' => false,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ], [
            'url' => 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg',
            'protocol' => 'https',
            'text' => 'relative cat',
            'title' => 'relative path with base href',
            'target' => null,
            'rel' => null,
            'image' => [],
            'isNofollow' => false,
            'isUGC' => false,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ],
    ], $web->linksWithDetails);
}
/**
* @test
*/
public function testImageUrl()
{
    // Verifies that `linksWithDetails` lists, under the `image` key, the image
    // URLs expected for each link on the page (one, two and one respectively).
    $web = new \Spekulatius\PHPScraper\PHPScraper;

    // Navigate to the test page.
    $web->go('https://test-pages.phpscraper.de/links/image-url.html');

    // Check the number of links. `assertCount` takes the expected size first
    // and the collection second.
    $this->assertCount(3, $web->links);

    // Check the complex links list
    $this->assertSame([
        [
            'url' => 'https://placekitten.com/432/500',
            'protocol' => 'https',
            'text' => '',
            'title' => null,
            'target' => null,
            'rel' => 'nofollow',
            'image' => [
                'https://placekitten.com/432/287',
            ],
            'isNofollow' => true,
            'isUGC' => false,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ], [
            // A link may report more than one image.
            'url' => 'https://placekitten.com/456/500',
            'protocol' => 'https',
            'text' => '',
            'title' => null,
            'target' => null,
            'rel' => 'ugc',
            'image' => [
                'https://placekitten.com/456/400',
                'https://placekitten.com/456/300',
            ],
            'isNofollow' => false,
            'isUGC' => true,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ], [
            'url' => 'https://placekitten.com/345/500',
            'protocol' => 'https',
            'text' => 'This is image',
            'title' => null,
            'target' => null,
            'rel' => 'nofollow ugc',
            'image' => [
                'https://placekitten.com/345/287',
            ],
            'isNofollow' => true,
            'isUGC' => true,
            'isSponsored' => false,
            'isMe' => false,
            'isNoopener' => false,
            'isNoreferrer' => false,
        ],
    ], $web->linksWithDetails);
}
/**
* @test
*/
public function testInternalLinks()
{
    // Only the link pointing at the page's own host is expected to be
    // reported as internal.
    $expectedInternal = ['https://test-pages.phpscraper.de/assets/cat.jpg'];

    // Load the test page in a fresh scraper.
    $scraper = new \Spekulatius\PHPScraper\PHPScraper;
    $scraper->go('https://test-pages.phpscraper.de/links/base-href.html');

    // Compare against the internal links list.
    self::assertSame($expectedInternal, $scraper->internalLinks);
}
/**
* @test
*/
public function testExternalLinks()
{
    // Links on any host other than the page's own are expected to be
    // reported as external.
    $expectedExternal = [
        'https://placekitten.com/408/287',
        'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg',
    ];

    // Load the test page in a fresh scraper.
    $scraper = new \Spekulatius\PHPScraper\PHPScraper;
    $scraper->go('https://test-pages.phpscraper.de/links/base-href.html');

    // Compare against the external links list.
    self::assertSame($expectedExternal, $scraper->externalLinks);
}
}

View file

@ -0,0 +1,51 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the extraction of ordered and unordered lists.
 */
class ListsTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Checks that both lists on the page are detected and that their items
     * are exposed as plain text under `children_plain`.
     *
     * @test
     */
    public function checkCountTest()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        /**
         * Navigate to the test page. This page contains:
         *
         * <h2>Example 1: Unordered List</h2>
         * <ul>
         *     <li>Unordered item 1</li>
         *     <li>Unordered item 2</li>
         *     <li>Unordered item with <b>HTML</b></li>
         * </ul>
         *
         * <h2>Example 2: Ordered List</h2>
         * <ol>
         *     <li>Order list item 1</li>
         *     <li>Order list item 2</li>
         *     <li>Order list item with <i>HTML</i></li>
         * </ol>
         *
         * NOTE(review): the item texts in this listing differ slightly from the
         * strings asserted below (e.g. "Unordered item 1" vs. "Unordered list
         * item 1") - confirm against the hosted page which wording is current.
         */
        $web->go('https://test-pages.phpscraper.de/content/lists.html');

        // Check all lists are recognized. `assertCount` takes the expected
        // size first and the collection second.
        $this->assertCount(2, $web->lists);
        $this->assertCount(1, $web->unorderedLists);
        $this->assertCount(1, $web->orderedLists);

        // Check the contents
        $this->assertSame([
            'Ordered list item 1',
            'Ordered list item 2',
            'Ordered list item with HTML',
        ], $web->orderedLists[0]['children_plain']);

        $this->assertSame([
            'Unordered list item 1',
            'Unordered list item 2',
            'Unordered list item with HTML',
        ], $web->unorderedLists[0]['children_plain']);
    }
}

View file

@ -0,0 +1,88 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `author` property against pages with a missing author, HTML
 * entities, plain text, German umlauts and Chinese characters.
 */
class MetaAuthorTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingAuthor()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page. The path previously contained a doubled
        // `meta/` segment, so the request did not hit the same `missing.html`
        // page the other "missing" tests use.
        $web->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Check the author as not given (null)
        $this->assertNull($web->author);
    }

    /**
     * @test
     */
    public function testWithHTMLEntity()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/html-entities.html');

        // Check the author
        $this->assertSame(
            'Cat & Mouse',
            $web->author
        );
    }

    /**
     * @test
     */
    public function testLoremIpsum()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        // Check the author
        $this->assertSame(
            'Lorem ipsum',
            $web->author
        );
    }

    /**
     * @test
     */
    public function testGermanUmlaute()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');

        // Check the author
        $this->assertSame(
            'Müller',
            $web->author
        );
    }

    /**
     * @test
     */
    public function testChineseCharacters()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');

        // Check the author
        $this->assertSame(
            '貓',
            $web->author
        );
    }
}

View file

@ -0,0 +1,37 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `charset` property for a page without and a page with a charset.
 */
class MetaCharsetTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingCharset()
    {
        // Load a page that declares no charset.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Without a declaration the property is expected to be null.
        self::assertNull($scraper->charset);
    }

    /**
     * @test
     */
    public function testWithCharset()
    {
        // Load a page that declares a charset.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        // The declared charset is expected verbatim.
        self::assertSame('utf-8', $scraper->charset);
    }
}

View file

@ -0,0 +1,37 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `contentType` property for a page without and a page with a
 * content type.
 */
class MetaContentTypeTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingContentType()
    {
        // Load a page that declares no content type.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Without a declaration the property is expected to be null.
        self::assertNull($scraper->contentType);
    }

    /**
     * @test
     */
    public function testWithContentType()
    {
        // Load a page that declares a content type.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        // The declared content type is expected verbatim.
        self::assertSame('text/html; charset=utf-8', $scraper->contentType);
    }
}

View file

@ -0,0 +1,38 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `csrfToken` property for a page without and a page with a
 * `csrf-token` meta tag.
 */
class MetaCsrfTokenTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingCsrfToken()
    {
        // Load a page that carries no CSRF token.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Without a token the property is expected to be null.
        self::assertNull($scraper->csrfToken);
    }

    /**
     * @test
     */
    public function testWithCsrfToken()
    {
        // Load the test page.
        // It contains: <meta name="csrf-token" content="token" />
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        // The token value is expected verbatim.
        self::assertSame('token', $scraper->csrfToken);
    }
}

View file

@ -0,0 +1,88 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `description` property against pages with a missing description,
 * HTML entities, plain text, German umlauts and Chinese characters.
 */
class MetaDescriptionTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingDescription()
    {
        // Load a page that has no description.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Without a description the property is expected to be null.
        self::assertNull($scraper->description);
    }

    /**
     * @test
     */
    public function testWithHTMLEntity()
    {
        $expectedDescription = 'Cat & Mouse';

        // Load the page whose description contains an HTML entity.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/html-entities.html');

        self::assertSame($expectedDescription, $scraper->description);
    }

    /**
     * @test
     */
    public function testLoremIpsum()
    {
        $expectedDescription = 'Lorem ipsum dolor etc.';

        // Load the plain-text example page.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        self::assertSame($expectedDescription, $scraper->description);
    }

    /**
     * @test
     */
    public function testGermanUmlaute()
    {
        $expectedDescription = 'Eine deutsche Beschreibung mit Umlauten: ä ü ö';

        // Load the page with German umlauts.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');

        self::assertSame($expectedDescription, $scraper->description);
    }

    /**
     * @test
     */
    public function testChineseCharacters()
    {
        $expectedDescription = 'A description with Chinese Characters: 加油';

        // Load the page with Chinese characters.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');

        self::assertSame($expectedDescription, $scraper->description);
    }
}

View file

@ -0,0 +1,99 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `image` property: method vs. property access, a missing image,
 * and absolute/relative image paths with and without a base href.
 */
class MetaImageTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testCallMethodsAreEqual()
    {
        // Load an external site (the author's blog).
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://peterthaleikis.com');

        // Calling `image()` as a method and reading `image` as a property
        // should yield the same value.
        self::assertSame($scraper->image(), $scraper->image);
    }

    /**
     * @test
     */
    public function testMissingImage()
    {
        // Load a page that has no image.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Without an image the property is expected to be null.
        self::assertNull($scraper->image);
    }

    /**
     * @test
     */
    public function testAbsolutePath()
    {
        $expectedImage = 'https://test-pages.phpscraper.de/assets/cat.jpg';

        // Load the page that references its image with an absolute path.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/image/absolute-path.html');

        self::assertSame($expectedImage, $scraper->image);
    }

    /**
     * @test
     */
    public function testRelativePath()
    {
        // The relative image path should come back as an absolute URL.
        $expectedImage = 'https://test-pages.phpscraper.de/assets/cat.jpg';

        // Load the page that references its image with a relative path.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/image/relative-path.html');

        self::assertSame($expectedImage, $scraper->image);
    }

    /**
     * @test
     */
    public function testAbsolutePathWithBaseHref()
    {
        // The expected URL stays on the page's own host.
        $expectedImage = 'https://test-pages.phpscraper.de/assets/cat.jpg';

        // Load the absolute-path page that also has a base href.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html');

        self::assertSame($expectedImage, $scraper->image);
    }

    /**
     * @test
     */
    public function testRelativePathBaseHref()
    {
        // The expected URL is on the base-href host, not the page's own host.
        $expectedImage = 'https://test-pages-with-base-href.phpscraper.de/assets/cat.jpg';

        // Load the relative-path page that also has a base href.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/meta/image/relative-path-with-base-href.html');

        self::assertSame($expectedImage, $scraper->image);
    }
}

View file

@ -0,0 +1,125 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `keywordString` and `keywords` properties: missing keywords,
 * different spacing around the separators, and non-ASCII content.
 */
class MetaKeywordsTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingKeywords()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Go to the test page
        $web->go('https://test-pages.phpscraper.de/meta/missing.html');

        // null if there aren't any keywords set.
        $this->assertNull($web->keywordString);

        // Empty array if there aren't any keywords set.
        //
        // `assertEmpty()` receives the evaluated property value. The former
        // `assertTrue(empty($web->keywords))` applied `empty()` directly to the
        // property; for overloaded properties PHP consults `__isset()` first,
        // so the check could pass without the value ever being read.
        // NOTE(review): presumably `keywords` is resolved via `__get` - confirm
        // in PHPScraper.
        $this->assertIsIterable($web->keywords);
        $this->assertEmpty($web->keywords);
    }

    /**
     * @test
     */
    public function testNoSpaces()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-no-spaces.html');

        // Check the keywords on this case...
        $this->assertSame('one,two,three', $web->keywordString);
        $this->assertSame(['one', 'two', 'three'], $web->keywords);
    }

    /**
     * @test
     */
    public function testSpaces()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-spaces.html');

        // Check the keywords on this case...
        $this->assertSame('one, two, three', $web->keywordString);
        $this->assertSame(['one', 'two', 'three'], $web->keywords);
    }

    /**
     * @test
     */
    public function testIrregularSpaces()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/keywords/parse-irregular-spaces.html');

        // Check the keywords on this case...
        $this->assertSame('one, two, three', $web->keywordString);
        $this->assertSame(['one', 'two', 'three'], $web->keywords);
    }

    /**
     * @test
     */
    public function testWithHTMLEntity()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/html-entities.html');

        // Check the keywords
        $this->assertSame(['Cat & Mouse', 'Mouse & Cat'], $web->keywords);
    }

    /**
     * @test
     */
    public function testLoremIpsum()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        // Check the keywords
        $this->assertSame(['Lorem', 'ipsum', 'dolor'], $web->keywords);
    }

    /**
     * @test
     */
    public function testGermanUmlaute()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/german-umlaute.html');

        // Check the keywords
        $this->assertSame(['keywords', 'schlüsselwörter'], $web->keywords);
    }

    /**
     * @test
     */
    public function testChineseCharacters()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/chinese-characters.html');

        // Check the keywords
        $this->assertSame(['加油', '貓'], $web->keywords);
    }
}

View file

@ -0,0 +1,45 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `viewportString` and `viewport` properties for a page without and
 * a page with a viewport definition.
 */
class MetaViewportTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingViewport()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Go to the test page
        $web->go('https://test-pages.phpscraper.de/meta/missing.html');

        // null if there isn't a viewport set.
        $this->assertNull($web->viewportString);

        // Empty array if there aren't any viewports set.
        //
        // `assertEmpty()` receives the evaluated property value. The former
        // `assertTrue(empty($web->viewport))` applied `empty()` directly to the
        // property; for overloaded properties PHP consults `__isset()` first,
        // so the check could pass without the value ever being read.
        // NOTE(review): presumably `viewport` is resolved via `__get` - confirm
        // in PHPScraper.
        $this->assertIsIterable($web->viewport);
        $this->assertEmpty($web->viewport);
    }

    /**
     * @test
     */
    public function testWithViewport()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        // Check the viewport: first the raw string, then the list of its
        // comma-separated parts.
        $this->assertSame(
            'width=device-width, initial-scale=1, shrink-to-fit=no, maximum-scale=1, user-scalable=no',
            $web->viewportString
        );
        $this->assertSame(
            ['width=device-width', 'initial-scale=1', 'shrink-to-fit=no', 'maximum-scale=1', 'user-scalable=no'],
            $web->viewport
        );
    }
}

View file

@ -0,0 +1,132 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests navigating between pages with `go()` and `clickLink()`, checking the
 * resulting first `h1` and `currentUrl`.
 */
class NavigationTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testSurfWithAbsoluteLink()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to test page #1.
        $web->go('https://test-pages.phpscraper.de/navigation/1.html');

        // Check the heading to see if we are actually at the right page...
        $this->assertSame('Page #1', $web->h1[0]);

        // Navigate to test page #2 using the absolute link.
        $web->clickLink('2 absolute');

        // Check the heading and URL to see if we actually moved...
        // (expected value first, so failure messages read correctly)
        $this->assertSame('Page #2', $web->h1[0]);
        $this->assertSame('https://test-pages.phpscraper.de/navigation/2.html', $web->currentUrl);
    }

    /**
     * @test
     */
    public function testSurfWithRelativeLink()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to test page #1.
        $web->go('https://test-pages.phpscraper.de/navigation/1.html');

        // Check the heading to see if we are actually at the right page...
        $this->assertSame('Page #1', $web->h1[0]);

        // Navigate to test page #2 using the relative link.
        $web->clickLink('2 relative');

        // Check the heading and URL to see if we actually moved...
        // (expected value first, so failure messages read correctly)
        $this->assertSame('Page #2', $web->h1[0]);
        $this->assertSame('https://test-pages.phpscraper.de/navigation/2.html', $web->currentUrl);
    }

    /**
     * Test navigation using an anchor text.
     *
     * @test
     */
    public function testLeavePageByText()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to test page #2.
        $web->go('https://test-pages.phpscraper.de/navigation/2.html');

        // Check the heading to see if we are actually at the right page...
        $this->assertSame('Page #2', $web->h1[0]);

        // Click the link with the text:
        $web->clickLink('external link');

        // Check the URL
        $this->assertSame('https://peterthaleikis.com/', $web->currentUrl);
    }

    /**
     * Test if we can navigate out using a redirect.
     *
     * @test
     */
    public function testLeavePageWithRedirect()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to test page #2.
        $web->go('https://test-pages.phpscraper.de/navigation/2.html');

        // Check the heading to see if we are actually at the right page...
        $this->assertSame('Page #2', $web->h1[0]);

        // Click the link with the text:
        $web->clickLink('external link with redirect');

        // Check the URL
        $this->assertSame('https://peterthaleikis.com/', $web->currentUrl);
    }

    /**
     * Test if we can navigate out.
     *
     * @test
     */
    public function testLeavePageByURL()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to test page #2.
        $web->go('https://test-pages.phpscraper.de/navigation/2.html');

        // Check the heading to see if we are actually at the right page...
        $this->assertSame('Page #2', $web->h1[0]);

        // Click the link identified by its URL (rather than its anchor text):
        $web->clickLink('https://peterthaleikis.com/');

        // Check the URL
        $this->assertSame('https://peterthaleikis.com/', $web->currentUrl);
    }

    /**
     * Test chainability of `clickLink`.
     *
     * @test
     */
    public function testClickLinkChainability()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to a page, click a link by URL and see if we are on the expected `currentUrl`.
        $web
            ->go('https://test-pages.phpscraper.de/navigation/2.html')
            ->clickLink('https://peterthaleikis.com/');

        // Check the URL
        $this->assertSame('https://peterthaleikis.com/', $web->currentUrl);
    }
}

View file

@ -0,0 +1,22 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
use PHPUnit\Framework\TestCase;
/**
 * Tests the scraper's behaviour when the requested page does not exist.
 */
class NotFoundTest extends TestCase
{
    /**
     * @test
     */
    public function testPageMissing()
    {
        // Request a page that does not exist.
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/page-does-not-exist.html');

        // This is the string the built-in server returns.
        self::assertSame('Page Not Found', $scraper->title);
    }
}

View file

@ -0,0 +1,49 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `openGraph` property for a page without and a page with
 * Open Graph properties.
 */
class OpenGraphTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function testMissingOpenGraph()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Go to the test page
        $web->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Empty array, because there aren't any open graph props set.
        //
        // `assertEmpty()` receives the evaluated property value. The former
        // `assertTrue(empty($web->openGraph))` applied `empty()` directly to the
        // property; for overloaded properties PHP consults `__isset()` first,
        // so the check could pass without the value ever being read.
        // NOTE(review): presumably `openGraph` is resolved via `__get` - confirm
        // in PHPScraper.
        $this->assertIsIterable($web->openGraph);
        $this->assertEmpty($web->openGraph);
    }

    /**
     * @test
     */
    public function testOpenGraph()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/og/example.html');

        // Check elements
        $this->assertSame('Lorem Ipsum', $web->openGraph['og:title']);
        $this->assertSame('Lorem ipsum dolor etc.', $web->openGraph['og:description']);

        // The whole set. `assertSame` on arrays also pins the key order.
        $this->assertSame(
            [
                'og:site_name' => 'Lorem ipsum',
                'og:type' => 'website',
                'og:title' => 'Lorem Ipsum',
                'og:description' => 'Lorem ipsum dolor etc.',
                'og:url' => 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html',
                'og:image' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
            ],
            $web->openGraph
        );
    }
}

View file

@ -0,0 +1,129 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `outline` (headings only) and `outlineWithParagraphs` (headings
 * and paragraphs) properties against the same test page.
 */
class OutlineTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function outlineTest()
    {
        // The headings expected in the outline, in document order.
        $expectedOutline = [
            ['tag' => 'h1', 'content' => 'We are testing here!'],
            ['tag' => 'h2', 'content' => 'Examples'],
            ['tag' => 'h3', 'content' => 'Example 1'],
            ['tag' => 'h3', 'content' => 'Example 2'],
            ['tag' => 'h3', 'content' => 'Example 3'],
        ];

        /**
         * Load the test page. Its content is:
         *
         * <h1>We are testing here!</h1>
         * <p>This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.</p>
         *
         * <h2>Examples</h2>
         * <p>There are numerous examples on the website. Please check them out to get more context on how scraping works.</p>
         *
         * <h3>Example 1</h3>
         * <p>Here would be an example.</p>
         *
         * <h3>Example 2</h3>
         * <p>Here would be the second example.</p>
         *
         * <h3>Example 3</h3>
         * <p>Here would be another example.</p>
         */
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/content/outline.html');

        // Compare against the content outline.
        self::assertSame($expectedOutline, $scraper->outline);
    }

    /**
     * @test
     */
    public function outlineWithParagraphsTest()
    {
        // Headings and paragraphs expected in document order; the trailing
        // empty paragraph of the page is part of the expected result.
        $expectedOutline = [
            [
                'tag' => 'h1',
                'content' => 'We are testing here!',
            ],
            [
                'tag' => 'p',
                'content' => 'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
            ],
            [
                'tag' => 'h2',
                'content' => 'Examples',
            ],
            [
                'tag' => 'p',
                'content' => 'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
            ],
            [
                'tag' => 'h3',
                'content' => 'Example 1',
            ],
            [
                'tag' => 'p',
                'content' => 'Here would be an example.',
            ],
            [
                'tag' => 'h3',
                'content' => 'Example 2',
            ],
            [
                'tag' => 'p',
                'content' => 'Here would be the second example.',
            ],
            [
                'tag' => 'h3',
                'content' => 'Example 3',
            ],
            [
                'tag' => 'p',
                'content' => 'Here would be another example.',
            ],
            [
                'tag' => 'p',
                'content' => '',
            ],
        ];

        /**
         * Load the test page. Its content is:
         *
         * <h1>We are testing here!</h1>
         * <p>This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.</p>
         *
         * <h2>Examples</h2>
         * <p>There are numerous examples on the website. Please check them out to get more context on how scraping works.</p>
         *
         * <h3>Example 1</h3>
         * <p>Here would be an example.</p>
         *
         * <h3>Example 2</h3>
         * <p>Here would be the second example.</p>
         *
         * <h3>Example 3</h3>
         * <p>Here would be another example.</p>
         *
         * <!-- an empty paragraph -->
         * <p></p>
         */
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/content/outline.html');

        // Compare against the outline including paragraphs.
        self::assertSame($expectedOutline, $scraper->outlineWithParagraphs);
    }
}

View file

@ -0,0 +1,87 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Tests the `paragraphs` property (includes the empty paragraph) and the
 * `cleanParagraphs` property (omits it) against the same test page.
 */
class ParagraphsTest extends \PHPUnit\Framework\TestCase
{
    /**
     * @test
     */
    public function paragraphTest()
    {
        // All paragraphs in document order; the last, empty one is included.
        $expectedParagraphs = [
            'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
            'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
            'Here would be an example.',
            'Here would be the second example.',
            'Here would be another example.',
            '',
        ];

        /**
         * Load the test page. Its content is:
         *
         * <h1>We are testing here!</h1>
         * <p>This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.</p>
         *
         * <h2>Examples</h2>
         * <p>There are numerous examples on the website. Please check them out to get more context on how scraping works.</p>
         *
         * <h3>Example 1</h3>
         * <p>Here would be an example.</p>
         *
         * <h3>Example 2</h3>
         * <p>Here would be the second example.</p>
         *
         * <h3>Example 3</h3>
         * <p>Here would be another example.</p>
         *
         * <!-- an empty paragraph -->
         * <p></p>
         */
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/content/outline.html');

        // Compare against the paragraphs.
        self::assertSame($expectedParagraphs, $scraper->paragraphs);
    }

    /**
     * @test
     */
    public function cleanParagraphTest()
    {
        // Same paragraphs as on the page, but without the empty one.
        $expectedParagraphs = [
            'This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.',
            'There are numerous examples on the website. Please check them out to get more context on how scraping works.',
            'Here would be an example.',
            'Here would be the second example.',
            'Here would be another example.',
        ];

        /**
         * Load the test page. Its content is:
         *
         * <h1>We are testing here!</h1>
         * <p>This page contains an example structure to be parsed. It comes with a number of headings and nested paragraphs as an scrape example.</p>
         *
         * <h2>Examples</h2>
         * <p>There are numerous examples on the website. Please check them out to get more context on how scraping works.</p>
         *
         * <h3>Example 1</h3>
         * <p>Here would be an example.</p>
         *
         * <h3>Example 2</h3>
         * <p>Here would be the second example.</p>
         *
         * <h3>Example 3</h3>
         * <p>Here would be another example.</p>
         *
         * <!-- an empty paragraph to check if it gets filtered out correctly -->
         * <p></p>
         */
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go('https://test-pages.phpscraper.de/content/outline.html');

        // Compare against the cleaned-up paragraphs.
        self::assertSame($expectedParagraphs, $scraper->cleanParagraphs);
    }
}

View file

@ -0,0 +1,408 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
class ParserCsvTest extends \PHPUnit\Framework\TestCase
{
/**
* @test
*/
public function testCsvParsingContext()
{
    // Verifies that `parseCsv()` and `parseCsvWithHeader()` throw when called
    // without any context. Context means either the scraper has navigated
    // before (URL context) or it was handed something to (fetch +) parse.
    //
    // The message is captured and asserted outside of the `catch` block so the
    // test fails - instead of passing silently - when no exception is thrown.

    // Case 1: `parseCsv()` on a fresh scraper.
    $message = null;
    try {
        $web = new \Spekulatius\PHPScraper\PHPScraper;
        $web->parseCsv();
    } catch (\Exception $e) {
        $message = $e->getMessage();
    }

    // Did we get the expected exception?
    $this->assertSame(
        'You can not call parseCsv() without parameter or initial navigation.',
        $message
    );

    // Case 2: `parseCsvWithHeader()` on a fresh scraper.
    $message = null;
    try {
        $web = new \Spekulatius\PHPScraper\PHPScraper;
        $web->parseCsvWithHeader();
    } catch (\Exception $e) {
        $message = $e->getMessage();
    }

    // Did we get the expected exception?
    $this->assertSame(
        'You can not call parseCsvWithHeader() without parameter or initial navigation.',
        $message
    );
}
/**
* @test
*/
public function testCsvDecodeRaw()
{
    // With raw decoding every cell is expected as a string.
    $expectedRows = [
        ['date', 'value'],
        ['1945-02-06', '4.20'],
        ['1952-03-11', '42'],
    ];

    $scraper = new \Spekulatius\PHPScraper\PHPScraper;

    // Decode an inline CSV string.
    $inlineRows = $scraper->csvDecodeRaw("date,value\n1945-02-06,4.20\n1952-03-11,42");
    self::assertSame($expectedRows, $inlineRows);

    // Fetch the hosted CSV file, then decode its contents.
    $remoteCsv = $scraper->fetchAsset('https://test-pages.phpscraper.de/test.csv');
    self::assertSame($expectedRows, $scraper->csvDecodeRaw($remoteCsv));
}
/**
* @test
*/
public function testCsvDecode()
{
    // With casting, numeric cells are expected as float/int, dates as strings.
    $expectedRows = [
        ['date', 'value'],
        ['1945-02-06', 4.20],
        ['1952-03-11', 42],
    ];

    $scraper = new \Spekulatius\PHPScraper\PHPScraper;

    // Decode an inline CSV string.
    $inlineRows = $scraper->csvDecode("date,value\n1945-02-06,4.20\n1952-03-11,42");
    self::assertSame($expectedRows, $inlineRows);

    // Fetch the hosted CSV file, then decode its contents.
    $remoteCsv = $scraper->fetchAsset('https://test-pages.phpscraper.de/test.csv');
    self::assertSame($expectedRows, $scraper->csvDecode($remoteCsv));
}
/**
* Test with pipe as separator, enclosure and escape.
*
* @test
*/
public function testCsvDecodeAndCustomEncoding()
{
    // Pipe-separated, double-quote-enclosed input whose last line consists of
    // a single backslash.
    $csv = "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"\n\\";

    // The lone backslash line is expected as its own one-cell row.
    $expectedRows = [
        ['date', 'value'],
        ['1945-02-06', 4.20],
        ['1952-03-11', 42],
        ['\\'],
    ];

    $scraper = new \Spekulatius\PHPScraper\PHPScraper;

    // Arguments after the CSV string: '|', '"' and '\\'.
    $decodedRows = $scraper->csvDecode($csv, '|', '"', '\\');

    self::assertSame($expectedRows, $decodedRows);
}
/**
* @test
*/
public function testCsvDecodeWithHeaderRaw()
{
    // Each data row is expected keyed by the header row; with raw decoding
    // every value is a string.
    $expectedRows = [
        ['date' => '1945-02-06', 'value' => '4.20'],
        ['date' => '1952-03-11', 'value' => '42'],
    ];

    $scraper = new \Spekulatius\PHPScraper\PHPScraper;

    // Decode an inline CSV string.
    $inlineRows = $scraper->csvDecodeWithHeaderRaw("date,value\n1945-02-06,4.20\n1952-03-11,42");
    self::assertSame($expectedRows, $inlineRows);

    // Fetch the hosted CSV file, then decode its contents.
    $remoteCsv = $scraper->fetchAsset('https://test-pages.phpscraper.de/test.csv');
    self::assertSame($expectedRows, $scraper->csvDecodeWithHeaderRaw($remoteCsv));
}
/**
* @test
*/
public function testCsvDecodeWithHeaderAndCasting()
{
    // Each data row is expected keyed by the header row, with numeric values
    // as float/int.
    $expectedRows = [
        ['date' => '1945-02-06', 'value' => 4.20],
        ['date' => '1952-03-11', 'value' => 42],
    ];

    $scraper = new \Spekulatius\PHPScraper\PHPScraper;
    $decodedRows = $scraper->csvDecodeWithHeader("date,value\n1945-02-06,4.20\n1952-03-11,42");

    self::assertSame($expectedRows, $decodedRows);
}
/**
 * Header-aware decoding with the separator (pipe), the enclosure (double quote) and
 * the escape character (backslash) passed explicitly.
 *
 * @test
 */
public function testCsvDecodeWithHeaderAndCustomEncoding()
{
    $scraper = new \Spekulatius\PHPScraper\PHPScraper;

    // Pipe-separated CSV with every cell wrapped in double quotes.
    $pipedCsv = "\"date\"|\"value\"\n\"1945-02-06\"|\"4.20\"\n\"1952-03-11\"|\"42\"";

    $decoded = $scraper->csvDecodeWithHeader($pipedCsv, '|', '"', '\\');

    $this->assertSame(
        [
            ['date' => '1945-02-06', 'value' => 4.20],
            ['date' => '1952-03-11', 'value' => 42],
        ],
        $decoded
    );
}
/**
 * Check the plumbing: Test the various ways to call `parseCsv()`.
 *
 * Every case uses a fresh scraper instance and is expected to produce the same rows.
 *
 * @test
 */
public function testDifferentCsvCalls()
{
    // For the reference we are using a simple CSV and its expected parsed form. This matches the hosted CSV.
    $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42";
    $csvData = [['date', 'value'], ['1945-02-06', 4.20], ['1952-03-11', 42]];

    // Case 1: Passing a CSV string in.
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // Parse the $csvString directly.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->parseCsv($csvString)
    );

    // Case 2: `go` + `parseCsv()`
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // Chained call using a CSV file as URL.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/test.csv')
            ->parseCsv()
    );

    // Case 3: `parseCsv()` with absolute URL.
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // Pass the absolute URL to `parseCsv()`
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->parseCsv('https://test-pages.phpscraper.de/test.csv')
    );

    // Case 4: `go` + `parseCsv()` with relative URL.
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // The 'go' sets the base URL for the following relative path.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->parseCsv('/test.csv')
    );

    // Case 5: `go` with base URL + `go` with relative URL + `parseCsv()`.
    // 5.1. Ensure the final URL is correct.
    $this->assertSame(
        'https://test-pages.phpscraper.de/test.csv',
        // The first 'go' sets the base URL for the following `go` with relative URL.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->go('/test.csv')
            ->currentUrl()
    );

    // 5.2. Ensure the parsed CSV is correct.
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // The first 'go' sets the base URL for the following `go` with relative URL.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->go('/test.csv')
            ->parseCsv()
    );

    // Case 6: With encoding params (pipe as separator, double quote as enclosure).
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // The first 'go' sets the base URL for the following `go` with relative URL.
        // `null` as first argument: parse the page navigated to instead of a passed-in string/URL.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->go('/test-custom.csv')
            ->parseCsv(null, '|', '"')
    );

    // Case 7: With encoding params and (relative) URL
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // The 'go' sets the base URL for the relative path passed to `parseCsv()`.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->parseCsv('/test-custom.csv', '|', '"')
    );
}
/**
 * Check the plumbing: Test the various ways to call `parseCsvWithHeader()`.
 *
 * Every case uses a fresh scraper instance and is expected to produce the same rows.
 *
 * @test
 */
public function testDifferentCsvWithHeaderCalls()
{
    // For the reference we are using a simple CSV and its expected parsed form. This matches the hosted CSV.
    $csvString = "date,value\n1945-02-06,4.20\n1952-03-11,42";
    $csvData = [
        ['date' => '1945-02-06', 'value' => 4.20],
        ['date' => '1952-03-11', 'value' => 42],
    ];

    // Case 1: Passing a CSV string in.
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // Parse the $csvString directly.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->parseCsvWithHeader($csvString)
    );

    // Case 2: `go` + `parseCsvWithHeader()`
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // Chained call using a CSV file as URL.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/test.csv')
            ->parseCsvWithHeader()
    );

    // Case 3: `parseCsvWithHeader()` with absolute URL.
    // (This call used to be asserted twice under two different labels; once is enough.)
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // Pass the absolute URL to `parseCsvWithHeader()`
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->parseCsvWithHeader('https://test-pages.phpscraper.de/test.csv')
    );

    // Case 4: `go` + `parseCsvWithHeader()` with relative URL.
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // The 'go' sets the base URL for the following relative path.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->parseCsvWithHeader('/test.csv')
    );

    // Case 5: `go` with base URL + `go` with relative URL + `parseCsvWithHeader()`.
    // 5.1. Ensure the final URL is correct.
    $this->assertSame(
        'https://test-pages.phpscraper.de/test.csv',
        // The first 'go' sets the base URL for the following `go` with relative URL.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->go('/test.csv')
            ->currentUrl()
    );

    // 5.2. Ensure the parsed CSV is correct.
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // The first 'go' sets the base URL for the following `go` with relative URL.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->go('/test.csv')
            ->parseCsvWithHeader()
    );

    // Case 6: With encoding params (pipe as separator, double quote as enclosure).
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // The first 'go' sets the base URL for the following `go` with relative URL.
        // `null` as first argument: parse the page navigated to instead of a passed-in string/URL.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->go('/test-custom.csv')
            ->parseCsvWithHeader(null, '|', '"')
    );

    // Case 7: With encoding params and (relative) URL
    $this->assertSame(
        // Pass the CSV Data as reference in.
        $csvData,
        // The 'go' sets the base URL for the relative path passed to `parseCsvWithHeader()`.
        (new \Spekulatius\PHPScraper\PHPScraper)
            ->go('https://test-pages.phpscraper.de/meta/feeds.html')
            ->parseCsvWithHeader('/test-custom.csv', '|', '"')
    );
}
}

View file

@ -0,0 +1,108 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
class ParserJsonTest extends \PHPUnit\Framework\TestCase
{
    /**
     * `parseJson()` needs context: either a previous navigation (URL context) or
     * something to (fetch +) parse. Without either, an exception is expected.
     *
     * @test
     */
    public function testJsonParsingContext()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Capture the exception instead of asserting inside the `catch` block. Asserting
        // only inside `catch` lets the test pass when nothing is thrown at all.
        $caught = null;
        try {
            $web->parseJson();
        } catch (\Exception $e) {
            $caught = $e;
        }

        // Fails if `parseJson()` returned normally.
        $this->assertInstanceOf(
            \Exception::class,
            $caught,
            'parseJson() without parameter or initial navigation should throw.'
        );

        // Did we get the expected exception?
        $this->assertSame(
            'You can not call parseJson() without parameter or initial navigation.',
            $caught->getMessage()
        );
    }

    /**
     * Test the various ways to call `parseJson()`.
     *
     * Every case uses a fresh scraper instance and is expected to produce the same data.
     *
     * @test
     */
    public function testDifferentJsonCalls()
    {
        // This instance is only used to download the reference JSON.
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // For the reference we are using a simple JSON and parse it.
        $jsonString = $web->fetchAsset('https://test-pages.phpscraper.de/index.json');
        $jsonData = json_decode($jsonString, true);

        // Case 1: Passing a JSON string in.
        $this->assertSame(
            // Pass the JSON Data as reference in.
            $jsonData,
            // Parse the $jsonString directly.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->parseJson($jsonString)
        );

        // Case 2: `go` + `parseJson()`
        $this->assertSame(
            // Pass the JSON Data as reference in.
            $jsonData,
            // Chained call using a JSON file as URL.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->go('https://test-pages.phpscraper.de/index.json')
                ->parseJson()
        );

        // Case 3: `parseJson()` with absolute URL.
        $this->assertSame(
            // Pass the JSON Data as reference in.
            $jsonData,
            // Pass the absolute URL to `parseJson()`
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->parseJson('https://test-pages.phpscraper.de/index.json')
        );

        // Case 4: `go` + `parseJson()` with relative URL.
        $this->assertSame(
            // Pass the JSON Data as reference in.
            $jsonData,
            // The 'go' sets the base URL for the following relative path.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->go('https://test-pages.phpscraper.de/meta/feeds.html')
                ->parseJson('/index.json')
        );

        // Case 5: `go` with base URL + `go` with relative URL + `parseJson()`.
        // 5.1. Ensure the final URL is correct.
        $this->assertSame(
            'https://test-pages.phpscraper.de/index.json',
            // The first 'go' sets the base URL for the following `go` with relative URL.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->go('https://test-pages.phpscraper.de/meta/feeds.html')
                ->go('/index.json')
                ->currentUrl()
        );

        // 5.2. Ensure the parsed JSON is correct.
        $this->assertSame(
            // Pass the JSON Data as reference in.
            $jsonData,
            // The first 'go' sets the base URL for the following `go` with relative URL.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->go('https://test-pages.phpscraper.de/meta/feeds.html')
                ->go('/index.json')
                ->parseJson()
        );
    }
}

View file

@ -0,0 +1,107 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
class ParserXmlTest extends \PHPUnit\Framework\TestCase
{
    /**
     * `parseXml()` needs context: either a previous navigation (URL context) or
     * something to (fetch +) parse. Without either, an exception is expected.
     *
     * NOTE(review): despite its name this method covers XML, not JSON. The name is kept
     * so existing filters/references keep working.
     *
     * @test
     */
    public function testJsonParsingContext()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Capture the exception instead of asserting inside the `catch` block. Asserting
        // only inside `catch` lets the test pass when nothing is thrown at all.
        $caught = null;
        try {
            $web->parseXml();
        } catch (\Exception $e) {
            $caught = $e;
        }

        // Fails if `parseXml()` returned normally.
        $this->assertInstanceOf(
            \Exception::class,
            $caught,
            'parseXml() without parameter or initial navigation should throw.'
        );

        // Did we get the expected exception?
        $this->assertSame(
            'You can not call parseXml() without parameter or initial navigation.',
            $caught->getMessage()
        );
    }

    /**
     * Test the various ways to call `parseXml()`.
     *
     * Every case uses a fresh scraper instance and is expected to produce the same data.
     *
     * @test
     */
    public function testDifferentXmlCalls()
    {
        // This instance is only used to download the reference XML (the sitemap of the test pages).
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // For the reference we are using a simple XML and parse it.
        // The SimpleXML tree is converted to a plain array via a JSON round trip.
        $xmlString = $web->fetchAsset('https://test-pages.phpscraper.de/sitemap.xml');
        $xml = simplexml_load_string($xmlString, 'SimpleXMLElement', LIBXML_NOCDATA);
        $xmlData = json_decode((string) json_encode($xml), true);

        // Case 1: Passing an XML string in.
        $this->assertSame(
            // Pass the XML Data as reference in.
            $xmlData,
            // Parse the XML string directly.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->parseXml($xmlString)
        );

        // Case 2: `go` + `parseXml()`
        $this->assertSame(
            // Pass the XML Data as reference in.
            $xmlData,
            // Chained call with XML as URL
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->go('https://test-pages.phpscraper.de/sitemap.xml')
                ->parseXml()
        );

        // Case 3: `parseXml()` with absolute URL.
        $this->assertSame(
            // Pass the XML Data as reference in.
            $xmlData,
            // Pass the absolute URL to `parseXml()`
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->parseXml('https://test-pages.phpscraper.de/sitemap.xml')
        );

        // Case 4: `go` + `parseXml()` with relative URL.
        $this->assertSame(
            // Pass the XML Data as reference in.
            $xmlData,
            // The 'go' sets the base URL for the following relative path.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->go('https://test-pages.phpscraper.de/meta/feeds.html')
                ->parseXml('/sitemap.xml')
        );

        // Case 5: `go` with base URL + `go` with relative URL + `parseXml()`.
        // 5.1. Ensure the final URL is correct.
        $this->assertSame(
            'https://test-pages.phpscraper.de/sitemap.xml',
            // The first 'go' sets the base URL for the following `go` with relative URL.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->go('https://test-pages.phpscraper.de/meta/feeds.html')
                ->go('/sitemap.xml')
                ->currentUrl()
        );

        // 5.2. Ensure the parsed XML is correct.
        $this->assertSame(
            // Pass the XML Data as reference in.
            $xmlData,
            // The first 'go' sets the base URL for the following `go` with relative URL.
            (new \Spekulatius\PHPScraper\PHPScraper)
                ->go('https://test-pages.phpscraper.de/meta/feeds.html')
                ->go('/sitemap.xml')
                ->parseXml()
        );
    }
}

View file

@ -0,0 +1,48 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
class RedirectTest extends \PHPUnit\Framework\TestCase
{
    /**
     * With the default configuration the redirect is expected to be followed, so the
     * current URL ends up on the redirect target.
     *
     * @test
     */
    public function testRedirect()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page: This redirects to phpscraper.de
        $web->go('https://test-pages.phpscraper.de');

        // PHPUnit takes the expected value first and the actual value second; keeping
        // that order makes failure messages read correctly.
        $this->assertNotSame(
            'https://test-pages.phpscraper.de/',
            $web->currentUrl
        );

        $this->assertSame(
            'https://phpscraper.de/',
            $web->currentUrl
        );
    }

    /**
     * With redirect handling switched off via `setConfig()`, the current URL is
     * expected to stay on the URL that was requested.
     *
     * @test
     */
    public function testDisabledRedirect()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;
        $web->setConfig([
            'follow_redirects' => false,
            'follow_meta_refresh' => false,
            'max_redirects' => -1,
        ]);

        // Navigate to the test page: This redirects to phpscraper.de
        $web->go('https://test-pages.phpscraper.de');

        $this->assertSame(
            'https://test-pages.phpscraper.de',
            $web->currentUrl,
        );
    }
}

105
lib/sc/tests/TitleTest.php Normal file
View file

@ -0,0 +1,105 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
class TitleTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Navigate a fresh scraper instance to `$url` and return what it reports as title.
     */
    private function titleOf($url)
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;
        $scraper->go($url);

        return $scraper->title;
    }

    /**
     * The title is expected to be null on the "missing" test page.
     *
     * @test
     */
    public function testMissingTitle()
    {
        $title = $this->titleOf('https://test-pages.phpscraper.de/meta/missing.html');

        $this->assertNull($title);
    }

    /**
     * The title on the HTML-entities test page is expected to contain a plain ampersand.
     *
     * @test
     */
    public function testWithHTMLEntity()
    {
        $title = $this->titleOf('https://test-pages.phpscraper.de/meta/html-entities.html');

        $this->assertSame('Cat & Mouse', $title);
    }

    /**
     * A plain title.
     *
     * @test
     */
    public function testLoremIpsum()
    {
        $title = $this->titleOf('https://test-pages.phpscraper.de/meta/lorem-ipsum.html');

        $this->assertSame('Lorem Ipsum', $title);
    }

    /**
     * A title containing German umlauts.
     *
     * @test
     */
    public function testGermanUmlaute()
    {
        $title = $this->titleOf('https://test-pages.phpscraper.de/meta/german-umlaute.html');

        $this->assertSame('A page with plenty of German umlaute everywhere (ä ü ö)', $title);
    }

    /**
     * A title containing Chinese characters.
     *
     * @test
     */
    public function testChineseCharacters()
    {
        $title = $this->titleOf('https://test-pages.phpscraper.de/meta/chinese-characters.html');

        $this->assertSame('Page with Chinese Characters all over the place (加油)', $title);
    }

    /**
     * A very long title is expected to come back in full.
     *
     * @test
     */
    public function testLongTitle()
    {
        $expected = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed mollis purus id ex consectetur facilisis. In gravida sodales nisl a consequat. Aenean ipsum sem, congue et rhoncus a, feugiat eget enim. Duis ut malesuada neque. Nam justo est, interdum eu massa in, volutpat vestibulum libero. Mauris a varius mauris, in vulputate ligula. Nulla rhoncus eget purus a sodales. Nulla facilisi. Proin purus purus, sodales non dolor in, lobortis elementum augue. Nulla sagittis, ex eu placerat varius, nulla mi rutrum odio, sit amet lacinia ipsum urna nec massa. Quisque posuere mauris id condimentum viverra.';

        $title = $this->titleOf('https://test-pages.phpscraper.de/title/long-title.html');

        $this->assertSame($expected, $title);
    }
}

View file

@ -0,0 +1,48 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
class TwitterCardTest extends \PHPUnit\Framework\TestCase
{
    /**
     * On a page without twitter card properties, an empty iterable is expected.
     *
     * @test
     */
    public function testMissingTwitterCard()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Go to the test page
        $web->go('https://test-pages.phpscraper.de/meta/missing.html');

        // Read the value once and assert on the value itself. `empty($object->property)`
        // asks `__isset()` first for inaccessible/overloaded properties and can report
        // "empty" without ever reading the value.
        // NOTE(review): `twitterCard` looks dynamically resolved (other names in these
        // tests are used both as property and as method) - confirm.
        $twitterCard = $web->twitterCard;

        // Empty array, because there aren't any twitter cards props set.
        $this->assertTrue(is_iterable($twitterCard));
        $this->assertEmpty($twitterCard);
    }

    /**
     * On the example page, the twitter card properties are expected as a map keyed by
     * the property name.
     *
     * @test
     */
    public function testTwitterCard()
    {
        $web = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigate to the test page.
        $web->go('https://test-pages.phpscraper.de/twittercard/example.html');

        // Check elements
        $this->assertSame('summary_large_image', $web->twitterCard['twitter:card']);
        $this->assertSame('Lorem Ipsum', $web->twitterCard['twitter:title']);

        // The whole set.
        $this->assertSame(
            [
                'twitter:card' => 'summary_large_image',
                'twitter:title' => 'Lorem Ipsum',
                'twitter:description' => 'Lorem ipsum dolor etc.',
                'twitter:url' => 'https://test-pages.phpscraper.de/meta/lorem-ipsum.html',
                'twitter:image' => 'https://test-pages.phpscraper.de/assets/cat.jpg',
            ],
            $web->twitterCard
        );
    }
}

212
lib/sc/tests/UrlTest.php Normal file
View file

@ -0,0 +1,212 @@
<?php
namespace Spekulatius\PHPScraper\Tests;
/**
 * Checks that the URL library (https://github.com/thephpleague/uri) is wired in
 * correctly and behaves as expected.
 */
class UrlTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Passing null to `makeUrlAbsolute` should always give null back.
     *
     * @test
     */
    public function testNullPassingThrough()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;

        $result = $scraper->makeUrlAbsolute(null);

        $this->assertNull($result);
    }

    /**
     * After navigating, the current URL, the host, and the host with protocol are
     * expected to reflect the visited page.
     *
     * @test
     */
    public function validateUriTest()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;

        // Any URL works for this check.
        $pageUrl = 'https://test-pages.phpscraper.de/content/lists.html';
        $scraper->go($pageUrl);

        // The URL itself.
        $this->assertSame('https://test-pages.phpscraper.de/content/lists.html', $scraper->currentUrl);

        // The host only.
        $this->assertSame('test-pages.phpscraper.de', $scraper->currentHost);

        // The host including the protocol.
        $this->assertSame('https://test-pages.phpscraper.de', $scraper->currentBaseHost);
    }

    /**
     * A `<base href>` on the page is expected to be reflected in the current base host.
     *
     * @test
     */
    public function testCurrentBaseHostWithBase()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;

        // The test page contains: <base href="https://test-pages-with-base-href.phpscraper.de/">
        $scraper->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html');

        $expectedBaseHost = 'https://test-pages-with-base-href.phpscraper.de';
        $this->assertSame($expectedBaseHost, $scraper->currentBaseHost);
    }

    /**
     * Basic processing of the URLs, relative to the page navigated to.
     *
     * @test
     */
    public function testMakeUrlAbsolute()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigating sets the base URL.
        $scraper->go('https://phpscraper.de');

        // Each entry: [input, expected absolute URL]. Checked in this order.
        $scenarios = [
            // With leading slash
            ['/index.html', 'https://phpscraper.de/index.html'],
            // Without leading slash
            ['index.html', 'https://phpscraper.de/index.html'],
            // Paths are considered.
            ['test/index.html', 'https://phpscraper.de/test/index.html'],
            // Absolute URLs are untouched.
            ['https://example.com/index.html', 'https://example.com/index.html'],
            // Protocol is considered
            ['http://example.com/index.html', 'http://example.com/index.html'],
        ];

        foreach ($scenarios as [$input, $expected]) {
            $this->assertSame($expected, $scraper->makeUrlAbsolute($input));
        }
    }

    /**
     * Basic processing of the URLs when the visited page declares a `<base href>`.
     *
     * @test
     */
    public function testMakeUrlAbsoluteConsiderBaseHref()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigating sets the base URL. The page lives on `test-pages.phpscraper.de` but contains
        //
        //     <base href="https://test-pages-with-base-href.phpscraper.de/">
        //
        // The page content itself isn't used; the navigation only establishes the context.
        $scraper->go('https://test-pages.phpscraper.de/meta/image/absolute-path-with-base-href.html');

        // Each entry: [input, expected absolute URL]. Checked in this order.
        $scenarios = [
            // With leading slash
            ['/index.html', 'https://test-pages-with-base-href.phpscraper.de/index.html'],
            // Without leading slash
            ['index.html', 'https://test-pages-with-base-href.phpscraper.de/index.html'],
            // Paths are considered.
            ['test/index.html', 'https://test-pages-with-base-href.phpscraper.de/test/index.html'],
            // Absolute URLs are untouched.
            ['https://example.com/index.html', 'https://example.com/index.html'],
            // Protocol is considered
            ['http://example.com/index.html', 'http://example.com/index.html'],
        ];

        foreach ($scenarios as [$input, $expected]) {
            $this->assertSame($expected, $scraper->makeUrlAbsolute($input));
        }
    }

    /**
     * An explicitly passed host is expected to win over any base-href and the current URL.
     *
     * @test
     */
    public function testMakeUrlAbsoluteWithBaseHost()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper;

        // Navigating sets the base URL.
        $scraper->go('https://phpscraper.de');

        // Each entry: [input, base passed as second argument, expected absolute URL]. Checked in this order.
        $scenarios = [
            // With leading slash
            ['/index.html', 'https://example.com', 'https://example.com/index.html'],
            // Without leading slash
            ['index.html', 'https://example.com', 'https://example.com/index.html'],
            // Paths are considered.
            ['test/index.html', 'https://example.com', 'https://example.com/test/index.html'],
            // Absolute URLs are untouched.
            [
                'https://example.com/index.html',
                'https://example-2.com/test/with/path',
                'https://example.com/index.html',
            ],
            // Protocol is considered
            [
                'http://example.com/index.html',
                'https://example-2.com/test/with/path',
                'http://example.com/index.html',
            ],
        ];

        foreach ($scenarios as [$input, $base, $expected]) {
            $this->assertSame($expected, $scraper->makeUrlAbsolute($input, $base));
        }
    }
}