jb-data.de/lib/SimpleExcel/Parser/HTMLParser.php
2025-08-11 22:23:30 +02:00

95 lines
3.1 KiB
PHP

<?php
namespace SimpleExcel\Parser;
use SimpleExcel\Exception\SimpleExcelException;
/**
* SimpleExcel class for parsing HTML tables
*
* @author Faisalman
* @package SimpleExcel
*/
class HTMLParser extends BaseParser implements IParser
{
    /**
     * Defines valid file extension
     *
     * @access   protected
     * @var      string
     */
    protected $file_extension = 'html';

    /**
     * Collect the text of every <th>/<td> element that is a direct child of a row
     *
     * @param    \DOMNode $tr  A <tr> element
     * @return   array         Cell values (nodeValue of each cell) in document order
     */
    private function collectRowCells($tr) {
        $cells = array();
        foreach ($tr->childNodes as $cell) {
            // Skip whitespace/text nodes and any element that is not a table cell
            if ($cell->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }
            if ($cell->nodeName === 'th' || $cell->nodeName === 'td') {
                $cells[] = $cell->nodeValue;
            }
        }
        return $cells;
    }

    /**
     * Build a DOMDocument from a UTF-8 HTML string
     *
     * An empty input yields an empty document instead of reaching
     * DOMDocument::loadHTML(), which rejects empty strings (ValueError on PHP 8,
     * warning on earlier versions).
     *
     * @param    string $markup  HTML source, expected to be UTF-8 encoded
     * @return   \DOMDocument    Parsed document (without any nodes when $markup is empty)
     */
    private function createDocument($markup) {
        $html = new \DOMDocument('1.0', 'UTF-8');
        $markup = (string) $markup;
        if ($markup === '') {
            return $html;
        }

        // Turn every non-ASCII character into a numeric entity so libxml decodes
        // the text correctly regardless of the charset it would otherwise guess.
        // This replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...), which is
        // deprecated as of PHP 8.2.
        $encoded = mb_encode_numericentity($markup, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

        // Buffer libxml parse errors (unknown HTML5 tags, sloppy markup) instead
        // of letting them surface as PHP warnings.
        $previous = libxml_use_internal_errors(true);
        $html->loadHTML($encoded);
        if (!$previous) {
            // Only discard the buffer when the caller was not collecting errors itself
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);

        $html->encoding = 'UTF-8';
        return $html;
    }

    /**
     * Process the loaded file/string
     *
     * Walks every <table> in the document and stores one array of cell values
     * per <tr> in $this->table_arr. Rows are read from <thead> and <tbody>
     * sections as well as from <tr> elements placed directly under <table>.
     *
     * @param    DOMDocument $html   DOMDocument object of HTML
     */
    private function parseDOM($html){
        $field = array();
        foreach ($html->getElementsByTagName('table') as $table) {
            foreach ($table->childNodes as $section) {
                if ($section->nodeType !== XML_ELEMENT_NODE) {
                    continue;
                }
                // Row placed directly under <table>, without a section wrapper
                if ($section->nodeName === 'tr') {
                    $field[] = $this->collectRowCells($section);
                    continue;
                }
                if ($section->nodeName !== 'thead' && $section->nodeName !== 'tbody') {
                    continue;
                }
                foreach ($section->childNodes as $tr) {
                    if ($tr->nodeType === XML_ELEMENT_NODE && $tr->nodeName === 'tr') {
                        $field[] = $this->collectRowCells($tr);
                    }
                }
            }
        }
        $this->table_arr = $field;
    }

    /**
     * Load the HTML file to be parsed
     *
     * Does nothing when isFileReady() reports the file as not ready. If the
     * file cannot be read, file_get_contents() raises its usual warning and the
     * parsed table is left empty.
     *
     * @param    string  $file_path  Path to HTML file
     */
    public function loadFile($file_path) {
        if (!$this->isFileReady($file_path)) {
            return;
        }
        $contents = file_get_contents($file_path);
        if ($contents === false) {
            $contents = '';
        }
        $this->parseDOM($this->createDocument($contents));
    }

    /**
     * Load the string to be parsed
     *
     * @param    string  $str    String with HTML format
     */
    public function loadString($str){
        $this->parseDOM($this->createDocument($str));
    }
}