295 lines
No EOL
6.9 KiB
PHP
295 lines
No EOL
6.9 KiB
PHP
<?php
|
|
/**
 * Heuristic PDF-to-plain-text extractor.
 *
 * The converter scans the raw PDF bytes for "stream ... endstream" sections,
 * inflates them when they are zlib-compressed, cuts out the "BT ... ET" text
 * objects and finally concatenates the literal strings "( ... )" found inside
 * those objects. It does not build a real PDF object model, so only literal
 * strings in content streams are extracted.
 */
class pdf2txt
{
    /** @var string|false Source passed to file_get_contents(), or false to use the data given via setInput()/convert(). */
    public $src = false;

    /** @var string|false File that receives the extracted text, or false to make convert() return the text. */
    public $dest = false;

    /** @var string|false Raw PDF data; false until some data has been supplied or loaded. */
    public $data = false;

    /**
     * Creates a converter.
     *
     * @param string|false $_src  source file, or false when the data is supplied directly
     * @param string|false $_dest destination file, or false to get the text as return value
     */
    public function __construct($_src = false, $_dest = false)
    {
        $this->setSource($_src);
        $this->setDestination($_dest);
    }

    /**
     * Former PHP 4 style constructor, kept as a plain method for callers that
     * invoke it explicitly. PHP 8 no longer treats it as a constructor, which is
     * why __construct() exists.
     *
     * @deprecated use the constructor or setSource()/setDestination()
     *
     * @param string|false $_src
     * @param string|false $_dest
     */
    public function pdf2txt($_src = false, $_dest = false)
    {
        $this->__construct($_src, $_dest);
    }

    /**
     * Sets the raw PDF data when no conversion from a file is necessary.
     *
     * @param string $_data raw PDF data
     */
    public function setInput($_data)
    {
        $this->data = $_data;
    }

    /**
     * Sets the source file.
     *
     * @param string|false $_src
     */
    public function setSource($_src)
    {
        $this->src = $_src;
    }

    /**
     * Sets the destination file.
     *
     * @param string|false $_dest
     */
    public function setDestination($_dest)
    {
        $this->dest = $_dest;
    }

    /**
     * Converts the given raw PDF data with a throw-away converter instance.
     *
     * Declared static because it uses no instance state; calling it on an
     * instance keeps working.
     *
     * @param string $_data raw PDF data
     *
     * @return string|false extracted text, or false when there is nothing to convert
     */
    public static function directConvert($_data)
    {
        $converter = new self();

        return $converter->convert($_data);
    }

    /**
     * Converts PDF data to plain text.
     *
     * When a source is configured it is loaded and takes precedence over
     * $_data and over data supplied earlier through setInput().
     *
     * @param string|false $_data raw PDF data, or false to use the configured source/input
     *
     * @return string|bool the extracted text when no destination is set;
     *                     otherwise true/false for the success of the write.
     *                     false is also returned when the source cannot be
     *                     read or no data is available.
     */
    public function convert($_data = false)
    {
        if (false !== $_data)
        {
            $this->data = $_data;
        }

        if (false !== $this->src)
        {
            $this->data = file_get_contents($this->src);
            if (false === $this->data)
            {
                // source could not be read
                return false;
            }
        }

        if (false === $this->data || null === $this->data)
        {
            // nothing to convert
            return false;
        }

        // a single charset is assumed for the whole document
        $fromEncoding = $this->detectSourceEncoding($this->data);

        $textObjects = array();
        foreach ($this->extractStreams($this->data) as $stream)
        {
            // "@" is deliberate: a failing inflate is the expected signal for a
            // stream that is not zlib-compressed, in which case it is used as is
            $uncompressed = @gzuncompress($stream);
            if (false === $uncompressed)
            {
                $uncompressed = $stream;
            }

            // convert to the internal encoding UTF-8
            $utf8 = iconv($fromEncoding, 'UTF-8', $uncompressed);
            if (false === $utf8)
            {
                // not convertible (typically binary data): no text to extract
                continue;
            }

            // hide escaped brackets behind placeholders so that they do not
            // take part in the bracket counting below
            $text = str_replace(
                array('\(', '\)', '\[', '\]'),
                array('##STARTBRACKET##', '##ENDBRACKET##', '##STARTSBRACKET##', '##ENDSBRACKET##'),
                $utf8
            );

            foreach ($this->extractTextObjects($text) as $textObject)
            {
                $textObjects[] = $textObject;
            }
        }

        $text = $this->decodeEscapes($this->collectStrings($textObjects));

        if (false !== $this->dest)
        {
            // store the text in the destination file and report success/failure
            return false !== file_put_contents($this->dest, $text);
        }

        return $text;
    }

    /**
     * Maps the first "/Encoding /Name" entry of the document to an iconv charset.
     *
     * @param string $data raw PDF data
     *
     * @return string 'macintosh' for MacRomanEncoding, 'windows-1252' otherwise
     */
    private function detectSourceEncoding($data)
    {
        if (preg_match('~/Encoding\s*/(\w+)~ism', $data, $encoding)
            && 'MacRomanEncoding' === $encoding[1])
        {
            return 'macintosh';
        }

        // WinAnsiEncoding, anything unknown, or no /Encoding entry at all
        return 'windows-1252';
    }

    /**
     * Cuts the contents of all "stream ... endstream" sections out of the data.
     *
     * The keywords are ignored while a "(" of an unfiltered stream is still
     * open, i.e. when they appear inside a string.
     *
     * @param string $data raw PDF data
     *
     * @return array list of raw (possibly compressed) stream contents
     */
    private function extractStreams($data)
    {
        $streams = array();
        $parts = preg_split(
            '~(<<\s*/.*?>>\s*stream\s*)|(\s*endstream\s*)|(\()|(\))~ism',
            $data,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        if (false === $parts)
        {
            // the regex engine gave up (e.g. backtrack limit): no streams
            return $streams;
        }

        $isStream = false;
        $encodedStream = false;
        $openBracketCount = 0;
        $stream = '';

        foreach ($parts as $part)
        {
            if (preg_match('~<<\s*/(.*?)>>\s*stream\s*~ism', $part, $match))
            {
                $token = 'stream';
                if (false !== strpos($match[1], '/Filter'))
                {
                    // brackets in filtered (binary) data are meaningless
                    $encodedStream = true;
                }
            }
            else
            {
                $token = trim($part);
            }

            if ('(' === $token)
            {
                if ($isStream && !$encodedStream)
                {
                    $openBracketCount++;
                }
            }
            elseif (')' === $token)
            {
                if ($isStream && !$encodedStream)
                {
                    $openBracketCount--;
                }
            }
            elseif ('endstream' === $token)
            {
                if ($isStream && $openBracketCount <= 0)
                {
                    $isStream = false;
                    $streams[] = $stream;
                    $stream = '';
                    $encodedStream = false;
                }
            }

            if ($isStream)
            {
                // every part inside a stream is content and is appended exactly
                // once, including a "stream" delimiter that is part of the content
                $stream .= $part;
            }
            elseif ('stream' === $token)
            {
                $isStream = true;
            }
        }

        return $streams;
    }

    /**
     * Cuts the "BT ... ET" text objects out of one decoded content stream.
     *
     * The keywords are ignored while a "(" is still open, i.e. when they
     * appear inside a string. An object that is not closed by "ET" is dropped.
     *
     * @param string $text stream content with escaped brackets already replaced by placeholders
     *
     * @return array list of text object bodies (without the BT/ET keywords)
     */
    private function extractTextObjects($text)
    {
        $textObjects = array();
        // "ET" may also be the very last token: the whitespace in front of
        // "endstream" is consumed when the streams are split, so "\z" is accepted
        $parts = preg_split(
            '~(\s*BT\s+)|(\s+ET(?:\s+|\z))|(\()|(\))~ism',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        if (false === $parts)
        {
            return $textObjects;
        }

        $isTextObj = false;
        $openBracketCount = 0;
        $textObject = '';

        foreach ($parts as $part)
        {
            $token = trim($part);

            if ('(' === $token)
            {
                if ($isTextObj)
                {
                    $openBracketCount++;
                }
            }
            elseif (')' === $token)
            {
                if ($isTextObj)
                {
                    $openBracketCount--;
                }
            }
            elseif ('ET' === $token)
            {
                if ($isTextObj && $openBracketCount <= 0)
                {
                    $isTextObj = false;
                    $textObjects[] = $textObject;
                    $textObject = '';
                }
            }

            if ($isTextObj)
            {
                // appended exactly once, also for a "BT" inside a string
                $textObject .= $part;
            }
            elseif ('BT' === $token)
            {
                $isTextObj = true;
            }
        }

        return $textObjects;
    }

    /**
     * Concatenates the literal strings of the given text objects.
     *
     * The operators Td, TD, T*, " and ' outside of a string produce a line
     * break, and every text object is terminated by a line break.
     *
     * @param array $textObjects text object bodies as returned by extractTextObjects()
     *
     * @return string text that still contains placeholders and escape sequences
     */
    private function collectStrings(array $textObjects)
    {
        $lineOperators = array('Td', 'TD', 'T*', '"', "'");
        $result = '';

        foreach ($textObjects as $textObject)
        {
            $parts = preg_split(
                '~(?:\s+(Td|TD|T\*|"|\')\s+)|(\()|(\))~ism',
                $textObject,
                -1,
                PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
            );
            if (false === $parts)
            {
                $parts = array();
            }

            // reset per text object so that an unterminated string cannot leak
            // into the next object
            $isString = false;
            $openBracketCount = 0;
            $string = '';

            foreach ($parts as $part)
            {
                if (in_array($part, $lineOperators, true))
                {
                    if (!$isString)
                    {
                        $result .= "\n";
                    }
                }
                elseif (')' === $part)
                {
                    if ($isString && $openBracketCount <= 0)
                    {
                        // closing bracket of the outermost string
                        $isString = false;
                        $result .= $string;
                        $string = '';
                    }
                    elseif ($isString)
                    {
                        $openBracketCount--;
                    }
                }

                if ($isString)
                {
                    $string .= $part;
                }

                if ('(' === $part)
                {
                    if ($isString)
                    {
                        // nested bracket, already appended above
                        $openBracketCount++;
                    }
                    else
                    {
                        $isString = true;
                    }
                }
            }

            $result .= "\n";
        }

        return $result;
    }

    /**
     * Resolves octal character codes, bracket placeholders and backslash
     * escape sequences in the collected text.
     *
     * @param string $text output of collectStrings()
     *
     * @return string
     */
    private function decodeEscapes($text)
    {
        // octal character codes consist of one to three octal digits; the byte
        // is read as ISO-8859-1 and codes up to and including 32 (control
        // characters and the space) are dropped
        $text = preg_replace_callback(
            '~\\\\([0-7]{1,3})~',
            function ($matches)
            {
                $code = octdec($matches[1]) & 0xFF;
                if ($code > 32)
                {
                    return iconv('ISO-8859-1', 'UTF-8', chr($code));
                }

                return '';
            },
            $text
        );

        // strtr() tries the longest key first and never rescans its own
        // output, so "\\" followed by "n" stays a backslash plus "n"
        $convert = array(
            '##STARTBRACKET##' => '(',
            '##ENDBRACKET##' => ')',
            '##STARTSBRACKET##' => '[',
            '##ENDSBRACKET##' => ']',
            // backslash followed by a real control character
            "\\\n" => "\n",
            "\\\r" => "\n",
            "\\\n\r" => "\n",
            "\\\t" => "\t",
            "\\\f" => "\f",
            // backslash followed by a letter (escape sequences of PDF literal strings)
            '\\n' => "\n",
            '\\r' => "\r",
            '\\t' => "\t",
            '\\b' => "\x08",
            '\\f' => "\f",
            '\\\\' => '\\',
        );

        return strtr($text, $convert);
    }
}
|
|
?>
|