init

2025-08-11 22:23:30 +02:00 · 2025-08-11 22:23:30 +02:00 · 72a26edcff
commit 72a26edcff
22092 changed files with 2101903 additions and 0 deletions
--- a/lib/class.pdf2txt2.inc.php
+++ b/lib/class.pdf2txt2.inc.php
@ -0,0 +1,295 @@
+<?php
+// Extracts the plain text of a PDF document.
+//
+// The PDF is either read from a source file or handed over as raw data. The
+// result is either written to a destination file or returned as a UTF-8 string.
+// Only literal strings "( ... )" inside BT/ET text objects are extracted, and a
+// single character encoding is assumed for the whole document.
+class pdf2txt
+{
+  // path of the PDF file to read, or false to work on $data only
+  var $src = false;
+  // path of the file the extracted text is written to, or false to return the text
+  var $dest = false;
+  // raw PDF data, or false as long as nothing has been set or loaded
+  var $data = false;
+
+  // constructor
+  // $_src:  source file (false = none), see setSource()
+  // $_dest: destination file (false = none), see setDestination()
+  function __construct($_src = false, $_dest = false)
+  {
+    $this->setSource($_src);
+    $this->setDestination($_dest);
+  }
+
+  // PHP 4 style constructor name; PHP 8 no longer treats a method named like
+  // the class as a constructor, so the real work lives in __construct().
+  // Kept so that explicit calls such as parent::pdf2txt() keep working.
+  function pdf2txt($_src = false, $_dest = false)
+  {
+    $this->__construct($_src, $_dest);
+  }
+
+  // set data if no conversion from file necessary
+  function setInput($_data)
+  {
+    $this->data = $_data;
+  }
+
+  // sets the source-file
+  function setSource($_src)
+  {
+    $this->src = $_src;
+  }
+
+  // sets the destination-file
+  function setDestination($_dest)
+  {
+    $this->dest = $_dest;
+  }
+
+  // converts raw PDF data with a throw-away instance and returns the text
+  // (or false on error); does not touch any instance state, hence static so
+  // that pdf2txt::directConvert() is a valid call on every PHP version
+  static function directConvert($_data)
+  {
+    $pdf2txt = new pdf2txt;
+    return $pdf2txt->convert($_data);
+  }
+
+  // convert PDF to text
+  //
+  // $_data: raw PDF data, or false to use what setInput()/setSource() provided.
+  //         A configured source file always wins over passed-in data.
+  // returns: false if the source file cannot be read or there is nothing to
+  //          convert; with a destination file true/false depending on whether
+  //          the write succeeded; otherwise the extracted text (UTF-8)
+  function convert($_data = false)
+  {
+    if(false !== $_data)
+      $this->data = $_data;
+
+    // load from file?
+    if(false !== $this->src)
+    {
+      $this->data = file_get_contents($this->src);
+      if(false === $this->data)
+      {
+        // [ ERROR ]
+        // file does not exist or is not readable
+        return false;
+      }
+    }
+
+    if(false === $this->data || null === $this->data)
+    {
+      // [ ERROR ]
+      // nothing to convert
+      return false;
+    }
+
+    // ###############################
+    // data available -> start parsing
+    // ###############################
+
+    $fromEncoding = $this->_detectEncoding($this->data);
+
+    // streams -> text objects -> strings; every text object ends with a newline
+    $return = '';
+    foreach($this->_extractStreams($this->data) as $stream)
+    {
+      foreach($this->_extractTextObjects($stream, $fromEncoding) as $textObject)
+        $return .= $this->_extractStrings($textObject) . "\n";
+    }
+
+    // substitute the placeholders for the brackets and escape sequences
+    $convert = array(
+      '##STARTBRACKET##' => '(',
+      '##ENDBRACKET##' => ')',
+      '##STARTSBRACKET##' => '[',
+      '##ENDSBRACKET##' => ']',
+      "\\\n" => "\n",
+      "\\\r" => "\n",
+      "\\\n\r" => "\n",
+      "\\\t" => "\t",
+      "\\\b" => "\b",
+      "\\\f" => "\f",
+      '\\\\' => '\\'
+    );
+
+    // replace octal character codes (backslash followed by three octal digits);
+    // a method callback is used because create_function() is gone in PHP 8
+    $text = preg_replace_callback(
+      '~\\\\([0-7]{3})~',
+      array($this, '_decodeOctal'),
+      $return
+    );
+
+    // execute conversion with $convert
+    $text = strtr($text, $convert);
+
+    if(false !== $this->dest)
+      // store $text into the specified destination file
+      // and return true on success or false on error
+      return false !== file_put_contents($this->dest, $text);
+
+    return $text;
+  }
+
+  // detects the iconv charset of the document from its first /Encoding entry
+  // and assumes that there is only a single charset for the whole document;
+  // anything but MacRomanEncoding (including no entry at all) is treated as
+  // windows-1252
+  function _detectEncoding($_data)
+  {
+    if(
+      preg_match('~/Encoding\s*/(\w+)~ism', $_data, $encoding) &&
+      'MacRomanEncoding' === $encoding[1]
+    )
+      return 'macintosh';
+
+    return 'windows-1252';
+  }
+
+  // collects the raw contents of all "stream ... endstream" sections
+  // the keywords "stream" and "endstream" are ignored if they are in a string,
+  // i.e. between unbalanced brackets of a stream that has no /Filter
+  function _extractStreams($_data)
+  {
+    $streams = array();
+    $stream = '';
+    $isStream = false;
+    $encodedStream = false;
+    $openBracketCount = 0;
+
+    $parts = preg_split(
+      '~(<<\s*/.*?>>\s*stream\s*)|(\s*endstream\s*)|(\()|(\))~ism',
+      $_data,
+      -1,
+      PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
+    );
+    if(false === $parts)
+      return $streams;
+
+    foreach($parts as $part)
+    {
+      if(preg_match('~<<\s*/(.*?)>>\s*stream\s*~ism', $part, $match))
+      {
+        $token = 'stream';
+        // brackets inside a filtered (encoded) stream carry no meaning
+        if(false !== strpos($match[1], '/Filter'))
+          $encodedStream = true;
+      }
+      else
+        $token = trim($part);
+
+      switch($token)
+      {
+        case '(':
+          if($isStream && !$encodedStream)
+            $openBracketCount++;
+        break;
+
+        case ')':
+          if($isStream && !$encodedStream)
+            $openBracketCount--;
+        break;
+
+        case 'endstream':
+          if($isStream && $openBracketCount <= 0)
+          {
+            $isStream = false;
+            $streams[] = $stream;
+            $stream = '';
+            $encodedStream = false;
+          }
+        break;
+      }
+
+      // inside a stream every part (also a nested keyword) is content and is
+      // appended exactly once; outside a stream only the header opens one
+      if($isStream)
+        $stream .= $part;
+      elseif('stream' === $token)
+        $isStream = true;
+    }
+
+    return $streams;
+  }
+
+  // returns the contents of all "BT ... ET" text objects of a single stream
+  // $_stream:       raw stream contents, zlib-compressed or plain
+  // $_fromEncoding: iconv charset the stream is converted from
+  // escaped brackets are replaced with ##...## placeholders in the result
+  function _extractTextObjects($_stream, $_fromEncoding)
+  {
+    // uncompress the stream
+    // if nothing to uncompress, assume that the stream is already uncompressed
+    $uncompressed = @gzuncompress($_stream);
+    if(false === $uncompressed)
+      $uncompressed = $_stream;
+
+    // convert to internal encoding UTF-8
+    $uncompressed = iconv($_fromEncoding, 'UTF-8', $uncompressed);
+    if(false === $uncompressed)
+      // not convertible (e.g. binary data) -> no text in here
+      return array();
+
+    // replace escaped brackets with placeholders
+    $text = str_replace(
+      array('\(','\)','\[','\]'),
+      array('##STARTBRACKET##','##ENDBRACKET##','##STARTSBRACKET##','##ENDSBRACKET##'),
+      $uncompressed
+    );
+
+    // the keywords "BT" and "ET" are ignored if they are in a string;
+    // "ET" is also accepted at the very end of the stream, because the stream
+    // splitting strips the whitespace in front of "endstream"
+    $parts = preg_split(
+      '~(\s*BT\s+)|(\s+ET(?:\s+|\z))|(\()|(\))~ism',
+      $text,
+      -1,
+      PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
+    );
+    if(false === $parts)
+      return array();
+
+    $textObjects = array();
+    $textObject = '';
+    $isTextObj = false;
+    $openBracketCount = 0;
+    foreach($parts as $part)
+    {
+      $token = trim($part);
+      switch($token)
+      {
+        case '(':
+          if($isTextObj)
+            $openBracketCount++;
+        break;
+
+        case ')':
+          if($isTextObj)
+            $openBracketCount--;
+        break;
+
+        case 'ET':
+          if($isTextObj && $openBracketCount <= 0)
+          {
+            $isTextObj = false;
+            $textObjects[] = $textObject;
+            $textObject = '';
+          }
+        break;
+      }
+
+      // inside a text object every part is appended exactly once
+      if($isTextObj)
+        $textObject .= $part;
+      elseif('BT' === $token)
+        $isTextObj = true;
+    }
+
+    return $textObjects;
+  }
+
+  // returns the text of one text object: the contents of its "( ... )"
+  // strings plus a newline for every Td, TD, T*, " and ' operator
+  // PDF-keywords are ignored if they are in a string
+  function _extractStrings($_textObject)
+  {
+    $parts = preg_split(
+      '~(?:\s+(Td|TD|T\*|"|\')\s+)|(\()|(\))~ism',
+      $_textObject,
+      -1,
+      PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
+    );
+    if(false === $parts)
+      return '';
+
+    $out = '';
+    // the string currently being collected; starts empty for every text object
+    $string = '';
+    $isString = false;
+    $openBracketCount = 0;
+    foreach($parts as $part)
+    {
+      switch($part)
+      {
+        // new line
+        case 'Td':
+        case 'TD':
+        case 'T*':
+        case '"':
+        case "'":
+          if(!$isString)
+            $out .= "\n";
+        break;
+
+        case ')':
+          if($isString && $openBracketCount <= 0)
+          {
+            // outermost bracket closed -> string complete
+            $isString = false;
+            $out .= $string;
+            $string = '';
+          }
+          elseif($isString)
+            $openBracketCount--;
+        break;
+      }
+
+      if($isString)
+        $string .= $part;
+
+      if('(' === $part)
+      {
+        if($isString)
+          // nested bracket, already appended above
+          $openBracketCount++;
+        else
+          // opening bracket itself is not part of the string
+          $isString = true;
+      }
+    }
+
+    return $out;
+  }
+
+  // preg_replace_callback() handler for octal character codes
+  // $_matches[1] holds the three octal digits; codes up to 32 are dropped,
+  // everything else is read as an ISO-8859-1 byte and returned as UTF-8
+  // (the same mapping utf8_encode() performed)
+  function _decodeOctal($_matches)
+  {
+    $code = octdec($_matches[1]);
+    if($code <= 32)
+      return '';
+
+    // chr() only looks at the low byte anyway; make that explicit
+    return iconv('ISO-8859-1', 'UTF-8', chr($code & 0xFF));
+  }
+}
+?>