init

2025-08-11 22:23:30 +02:00 · 2025-08-11 22:23:30 +02:00 · 72a26edcff
commit 72a26edcff
22092 changed files with 2101903 additions and 0 deletions
--- a/lib/PdfParser/Parser.php
+++ b/lib/PdfParser/Parser.php
@ -0,0 +1,314 @@
+<?php
+
+/**
+ * @file
+ *          This file is part of the PdfParser library.
+ *
+ * @author  Sébastien MALOT <sebastien@malot.fr>
+ * @date    2017-01-03
+ * @license LGPLv3
+ * @url     <https://github.com/smalot/pdfparser>
+ *
+ *  PdfParser is a pdf library written in PHP, extraction oriented.
+ *  Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.
+ *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
+ *
+ */
+
+namespace Smalot\PdfParser;
+
+use Smalot\PdfParser\Element\ElementArray;
+use Smalot\PdfParser\Element\ElementBoolean;
+use Smalot\PdfParser\Element\ElementDate;
+use Smalot\PdfParser\Element\ElementHexa;
+use Smalot\PdfParser\Element\ElementName;
+use Smalot\PdfParser\Element\ElementNull;
+use Smalot\PdfParser\Element\ElementNumeric;
+use Smalot\PdfParser\Element\ElementString;
+use Smalot\PdfParser\Element\ElementXRef;
+
+/**
+ * Class Parser
+ *
+ * @package Smalot\PdfParser
+ */
+class Parser
+{
+    /**
+     * @var Object[]
+     */
+    protected $objects = array();
+
+    /**
+     * Create a parser instance.
+     *
+     * Nothing is initialised here: the object registry starts out as an empty
+     * array through its property default and is reset by parseContent() on
+     * every run.
+     */
+    public function __construct()
+    {
+    }
+
+    /**
+     * Parse a PDF file.
+     *
+     * Reads the whole file into memory and delegates to parseContent().
+     *
+     * @param string $filename Path handed as-is to file_get_contents().
+     *
+     * @return Document
+     * @throws \Exception When the file cannot be read, or when parseContent()
+     *                    rejects the content.
+     */
+    public function parseFile($filename)
+    {
+        $content = file_get_contents($filename);
+
+        // file_get_contents() signals failure with false. Passing that on would
+        // hand an empty string to the parser and hide the real cause (missing or
+        // unreadable file) behind a parser error, so fail explicitly here.
+        if ($content === false) {
+            throw new \Exception('Unable to read file "' . $filename . '".');
+        }
+
+        // The error-control operator is kept on purpose: diagnostics raised while
+        // parsing stay suppressed exactly as before. Exceptions are not affected.
+        return @$this->parseContent($content);
+    }
+
+    /**
+     * Parse PDF content.
+     *
+     * The raw bytes are tokenised by \TCPDF_PARSER; every object it reports is
+     * converted through parseObject() and the trailer through parseTrailer().
+     * The internal object registry is reset on each call.
+     *
+     * @param string $content Raw PDF data (leading whitespace is stripped).
+     *
+     * @return Document
+     * @throws \Exception When the trailer declares encryption, when no object
+     *                    could be extracted, or when the underlying parser throws.
+     */
+    public function parseContent($content)
+    {
+        // Create structure using TCPDF Parser. Anything it prints is captured in
+        // an output buffer and discarded.
+        ob_start();
+
+        try {
+            @$parser = new \TCPDF_PARSER(ltrim($content));
+            list($xref, $data) = $parser->getParsedData();
+            unset($parser);
+        } catch (\Exception $e) {
+            // Balance the ob_start() above before propagating; otherwise the
+            // buffer stays open and swallows whatever the caller prints next.
+            ob_end_clean();
+
+            throw $e;
+        }
+
+        ob_end_clean();
+
+        if (isset($xref['trailer']['encrypt'])) {
+            throw new \Exception('Secured pdf file are currently not supported.');
+        }
+
+        if (empty($data)) {
+            throw new \Exception('Object list not found. Possible secured file.');
+        }
+
+        // Create destination object.
+        $document      = new Document();
+        $this->objects = array();
+
+        foreach ($data as $id => $structure) {
+            $this->parseObject($id, $structure, $document);
+            // Release the raw structure as soon as it has been converted.
+            unset($data[$id]);
+        }
+
+        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
+        $document->setObjects($this->objects);
+
+        return $document;
+    }
+
+    /**
+     * Convert a raw trailer dictionary into a Header.
+     *
+     * Keys are normalised with ucfirst(). Values are mapped as follows:
+     *  - numeric values become ElementNumeric;
+     *  - arrays are converted recursively (without a document) and wrapped in
+     *    an ElementArray;
+     *  - strings containing "_" become ElementXRef;
+     *  - anything else is treated as a literal string ("(" type).
+     *
+     * @param array         $structure
+     * @param Document|null $document  Null is passed for nested arrays.
+     *
+     * @return Header
+     */
+    protected function parseTrailer($structure, $document)
+    {
+        $entries = array();
+
+        foreach ($structure as $key => $raw) {
+            $key = ucfirst($key);
+
+            if (is_numeric($raw)) {
+                $entries[$key] = new ElementNumeric($raw, $document);
+                continue;
+            }
+
+            if (is_array($raw)) {
+                $nested        = $this->parseTrailer($raw, null);
+                $entries[$key] = new ElementArray($nested, null);
+                continue;
+            }
+
+            if (strpos($raw, '_') === false) {
+                $entries[$key] = $this->parseHeaderElement('(', $raw, $document);
+            } else {
+                $entries[$key] = new ElementXRef($raw, $document);
+            }
+        }
+
+        return new Header($entries, $document);
+    }
+
+    /**
+     * Convert one raw object structure into an Object and register it.
+     *
+     * Each part of the structure is a [type, value, ...] entry. Arrays and
+     * dictionaries replace the current header, a stream supplies the content,
+     * and any other non-empty element becomes a single-element header. A stream
+     * whose header Type is "ObjStm" is expanded into its embedded objects
+     * instead, and the container itself is not registered.
+     *
+     * NOTE(review): "Object" is presumably a class of this namespace declared
+     * elsewhere; "object" is a reserved word from PHP 7.2 on - confirm the
+     * targeted PHP version.
+     *
+     * @param string   $id        Registry key of the object.
+     * @param array    $structure Parts reported by the low-level parser.
+     * @param Document $document
+     *
+     * @throws \Exception When a part has a type parseHeaderElement() rejects.
+     */
+    protected function parseObject($id, $structure, $document)
+    {
+        $header  = new Header(array(), $document);
+        $content = '';
+
+        foreach ($structure as $part) {
+            switch ($part[0]) {
+                case '[':
+                    $elements = array();
+
+                    foreach ($part[1] as $sub_element) {
+                        $sub_type   = $sub_element[0];
+                        $sub_value  = $sub_element[1];
+                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
+                    }
+
+                    $header = new Header($elements, $document);
+                    break;
+
+                case '<<':
+                    $header = $this->parseHeader($part[1], $document);
+                    break;
+
+                case 'stream':
+                    // Prefer the decoded stream when the parser provides one.
+                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];
+
+                    if ($header->get('Type')->equals('ObjStm')) {
+                        // Only the embedded objects are registered; the
+                        // container's own content is not stored.
+                        $this->parseObjectStream($content, $document);
+
+                        return;
+                    }
+                    break;
+
+                default:
+                    if ($part != 'null') {
+                        $element = $this->parseHeaderElement($part[0], $part[1], $document);
+
+                        if ($element) {
+                            $header = new Header(array($element), $document);
+                        }
+                    }
+                    break;
+
+            }
+        }
+
+        if (!isset($this->objects[$id])) {
+            $this->objects[$id] = Object::factory($document, $header, $content);
+        }
+    }
+
+    /**
+     * Register every object embedded in an object stream.
+     *
+     * The stream starts with pairs of integers (object number, offset into the
+     * remaining data), followed by the objects themselves. Each embedded object
+     * is stored under "<number>_0" with an empty content.
+     *
+     * @param string   $content  Decoded stream content.
+     * @param Document $document
+     */
+    private function parseObjectStream($content, $document)
+    {
+        $match = array();
+
+        // Split xrefs and contents.
+        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
+        $content = $match[3];
+
+        // Extract xrefs.
+        $xrefs = preg_split(
+            '/(\d+\s+\d+\s*)/s',
+            $match[1],
+            -1,
+            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
+        );
+        $table = array();
+
+        foreach ($xrefs as $xref) {
+            // The pair may be separated by any whitespace run (newline, tab,
+            // several spaces), exactly as the patterns above allow; splitting on
+            // a single space would yield wrong or empty offsets.
+            list($object_number, $offset) = preg_split('/\s+/', trim($xref));
+            $table[$offset] = $object_number;
+        }
+
+        // Order by offset so that each object ends where the next one starts.
+        ksort($table);
+
+        $numbers = array_values($table);
+        $offsets = array_keys($table);
+
+        foreach ($offsets as $index => $offset) {
+            $object_id   = $numbers[$index] . '_0';
+            $next_offset = isset($offsets[$index + 1]) ? $offsets[$index + 1] : strlen($content);
+            $sub_content = substr($content, $offset, $next_offset - $offset);
+
+            $sub_header                = Header::parse($sub_content, $document);
+            $this->objects[$object_id] = Object::factory($document, $sub_header, '');
+        }
+    }
+
+    /**
+     * Build a Header from a flat list of alternating name / value entries.
+     *
+     * The entry at each even index supplies the key through its [1] slot; the
+     * entry that follows supplies the value as a [type, value] pair which is
+     * converted by parseHeaderElement().
+     *
+     * @param array    $structure
+     * @param Document $document
+     *
+     * @return Header
+     * @throws \Exception When a value has a type parseHeaderElement() rejects.
+     */
+    protected function parseHeader($structure, $document)
+    {
+        $entries = array();
+        $total   = count($structure);
+        $index   = 0;
+
+        while ($index < $total) {
+            $key   = $structure[$index][1];
+            $typed = $structure[$index + 1];
+
+            $entries[$key] = $this->parseHeaderElement($typed[0], $typed[1], $document);
+
+            $index += 2;
+        }
+
+        return new Header($entries, $document);
+    }
+
+    /**
+     * Turn a single [type, value] pair into the matching element object.
+     *
+     * @param string   $type     Type marker reported by the low-level parser.
+     * @param mixed    $value    Raw value; a nested list for '<<' and '['.
+     * @param Document $document
+     *
+     * @return Element|Header|null Null for the markers that carry no data
+     *                             ('endstream', 'obj' and the empty string).
+     * @throws \Exception When the type marker is not recognised.
+     */
+    protected function parseHeaderElement($type, $value, $document)
+    {
+        switch ($type) {
+            case '<<':
+                return $this->parseHeader($value, $document);
+
+            case 'numeric':
+                return new ElementNumeric($value, $document);
+
+            case 'boolean':
+                return new ElementBoolean($value, $document);
+
+            case 'null':
+                return new ElementNull($value, $document);
+
+            case '(':
+                // Try a date first and fall back to a plain string.
+                $literal = '(' . $value . ')';
+                $date    = ElementDate::parse($literal, $document);
+
+                return $date ? $date : ElementString::parse($literal, $document);
+
+            case '<':
+                // Hexadecimal strings are decoded, then handled like literal strings.
+                $decoded = ElementHexa::decode($value, $document);
+
+                return $this->parseHeaderElement('(', $decoded, $document);
+
+            case '/':
+                return ElementName::parse('/' . $value, $document);
+
+            case 'ojbref': // old mistake in tcpdf parser
+            case 'objref':
+                return new ElementXRef($value, $document);
+
+            case '[':
+                $items = array();
+
+                foreach ($value as $entry) {
+                    $items[] = $this->parseHeaderElement($entry[0], $entry[1], $document);
+                }
+
+                return new ElementArray($items, $document);
+
+            case 'endstream':
+            case 'obj': // Accepted and ignored rather than rejected as an invalid type.
+            case '':
+                // Nothing to build for these markers.
+                return null;
+
+            default:
+                throw new \Exception('Invalid type: "' . $type . '".');
+        }
+    }
+}