init
This commit is contained in:
commit
72a26edcff
22092 changed files with 2101903 additions and 0 deletions
295
lib/class.pdf2txt2.inc.php
Normal file
295
lib/class.pdf2txt2.inc.php
Normal file
|
|
@ -0,0 +1,295 @@
|
|||
<?php
|
||||
class pdf2txt
|
||||
{
|
||||
var $src;
|
||||
var $dest;
|
||||
var $data;
|
||||
|
||||
// constructor
|
||||
function pdf2txt($_src = false, $_dest = false)
|
||||
{
|
||||
$this->setSource($_src);
|
||||
$this->setDestination($_dest);
|
||||
}
|
||||
|
||||
// set data if no conversion from file nescessary
|
||||
function setInput($_data)
|
||||
{
|
||||
$this->data = $_data;
|
||||
}
|
||||
|
||||
// sets the source-file
|
||||
function setSource($_src)
|
||||
{
|
||||
$this->src = $_src;
|
||||
}
|
||||
|
||||
// sets the destination-file
|
||||
function setDestination($_dest)
|
||||
{
|
||||
$this->dest = $_dest;
|
||||
}
|
||||
|
||||
function directConvert($_data)
|
||||
{
|
||||
$pdf2txt = new pdf2txt;
|
||||
return $pdf2txt->convert($_data);
|
||||
}
|
||||
|
||||
// convert to pdf
|
||||
function convert($_data = false)
|
||||
{
|
||||
if(false !== $_data)
|
||||
$this->data = $_data;
|
||||
|
||||
|
||||
if(
|
||||
// load from file?
|
||||
(false !== $this->src) AND
|
||||
// file exists?
|
||||
(false === $this->data = file_get_contents($this->src))
|
||||
)
|
||||
{
|
||||
// [ ERROR ]
|
||||
// file does not exist
|
||||
return false;
|
||||
}
|
||||
|
||||
if($this->data === false)
|
||||
{
|
||||
// [ ERROR ]
|
||||
// nothing to convert
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// ###############################
|
||||
// data available -> start parsing
|
||||
// ###############################
|
||||
|
||||
// parse encoding
|
||||
preg_match('~/Encoding\s*/(\w+)~ism', $this->data, $encoding);
|
||||
|
||||
// detect encoding and assume that there is only a single charset for the hole document
|
||||
$fromEncoding = 'windows-1252';
|
||||
switch($encoding[1])
|
||||
{
|
||||
case 'MacRomanEncoding':
|
||||
$fromEncoding = 'macintosh';
|
||||
break;
|
||||
|
||||
case 'WinAnsiEncoding':
|
||||
// standard encoding
|
||||
break;
|
||||
}
|
||||
|
||||
// parse data
|
||||
// the following code ignores the keyword "stream" and "endstream" if they are in a string
|
||||
$isStream = false;
|
||||
$stream = '';
|
||||
$streams = array();
|
||||
$openBracketCount = 0;
|
||||
$encodedStream = false;
|
||||
foreach(preg_split('~(<<\s*/.*?>>\s*stream\s*)|(\s*endstream\s*)|(\()|(\))~ism', $this->data, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $k => $part)
|
||||
{
|
||||
if(preg_match('~<<\s*/(.*?)>>\s*stream\s*~ism', $part, $match))
|
||||
{
|
||||
$switch = 'stream';
|
||||
if(false !== strpos($match[1], '/Filter'))
|
||||
$encodedStream = true;
|
||||
}
|
||||
else
|
||||
$switch = trim($part);
|
||||
|
||||
switch($switch)
|
||||
{
|
||||
case '(':
|
||||
if($isStream AND !$encodedStream)
|
||||
$openBracketCount++;
|
||||
break;
|
||||
|
||||
case ')':
|
||||
if($isStream AND !$encodedStream)
|
||||
$openBracketCount--;
|
||||
break;
|
||||
|
||||
case 'endstream':
|
||||
if($isStream AND $openBracketCount <= 0)
|
||||
{
|
||||
$isStream = false;
|
||||
$streams[] = $stream;
|
||||
$stream = '';
|
||||
$encodedStream = false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if($isStream)
|
||||
{
|
||||
$stream .= $part;
|
||||
}
|
||||
|
||||
if($switch == 'stream')
|
||||
{
|
||||
if($isStream)
|
||||
$stream .= $part;
|
||||
else
|
||||
$isStream = true;
|
||||
}
|
||||
}
|
||||
|
||||
$textObjects = array();
|
||||
foreach($streams as $k => $stream)
|
||||
{
|
||||
// uncompress the stream
|
||||
if(false === $uncompressed = @gzuncompress($stream))
|
||||
// if nothing to uncompress, assume that the stream is already uncompressed
|
||||
$uncompressed = $stream;
|
||||
|
||||
// convert to internal encoding UTF-8
|
||||
$uncompressed = iconv($fromEncoding, 'UTF-8', $uncompressed);
|
||||
|
||||
// replace escaped brackets with placeholders
|
||||
$text = str_replace(array('\(','\)','\[','\]'), array('##STARTBRACKET##','##ENDBRACKET##','##STARTSBRACKET##','##ENDSBRACKET##'), $uncompressed);
|
||||
|
||||
// parse streams
|
||||
// the following code ignores the keyword "BT" and "ET" if they are in a string
|
||||
$isTextObj = false;
|
||||
$textObject = '';
|
||||
$openBracketCount = 0;
|
||||
foreach(preg_split('~(\s*BT\s+)|(\s+ET\s+)|(\()|(\))~ism', $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $k => $part)
|
||||
{
|
||||
$switch = trim($part);
|
||||
switch($switch)
|
||||
{
|
||||
case '(':
|
||||
if($isTextObj)
|
||||
$openBracketCount++;
|
||||
break;
|
||||
|
||||
case ')':
|
||||
if($isTextObj)
|
||||
$openBracketCount--;
|
||||
break;
|
||||
|
||||
case 'ET':
|
||||
if($isTextObj AND $openBracketCount <= 0)
|
||||
{
|
||||
$isTextObj = false;
|
||||
$textObjects[] = $textObject;
|
||||
$textObject = '';
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if($isTextObj)
|
||||
{
|
||||
$textObject .= $part;
|
||||
}
|
||||
|
||||
if($switch == 'BT')
|
||||
{
|
||||
if($isTextObj)
|
||||
$textObject .= $part;
|
||||
else
|
||||
$isTextObj = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$return = '';
|
||||
foreach($textObjects as $textObject)
|
||||
{
|
||||
// parse text-objects
|
||||
// the following code ignores PDF-keywords if they are in a string
|
||||
$isString = false;
|
||||
$openBracketCount = 0;
|
||||
foreach(preg_split('~(?:\s+(Td|TD|T\*|"|\')\s+)|(\()|(\))~ism', $textObject, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $k => $part)
|
||||
{
|
||||
switch($part)
|
||||
{
|
||||
// new line
|
||||
case 'Td':
|
||||
case 'TD':
|
||||
case 'T*':
|
||||
case '"':
|
||||
case "'":
|
||||
if(!$isString)
|
||||
$return .= "\n";
|
||||
break;
|
||||
|
||||
case ')':
|
||||
if($isString AND $openBracketCount <= 0)
|
||||
{
|
||||
$isString = false;
|
||||
$return .= $string;
|
||||
$string = '';
|
||||
}
|
||||
elseif($isString)
|
||||
$openBracketCount--;
|
||||
break;
|
||||
}
|
||||
|
||||
if($isString)
|
||||
{
|
||||
$string .= $part;
|
||||
}
|
||||
|
||||
if($part == '(')
|
||||
{
|
||||
if($isString)
|
||||
{
|
||||
$openBracketCount++;
|
||||
}
|
||||
else
|
||||
{
|
||||
$isString = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$return .= "\n";
|
||||
}
|
||||
|
||||
// substitute the placeholders for the brackets and escape sequences
|
||||
$convert = array(
|
||||
'##STARTBRACKET##' => '(',
|
||||
'##ENDBRACKET##' => ')',
|
||||
'##STARTSBRACKET##' => '[',
|
||||
'##ENDSBRACKET##' => ']',
|
||||
"\\\n" => "\n",
|
||||
"\\\r" => "\n",
|
||||
"\\\n\r" => "\n",
|
||||
"\\\t" => "\t",
|
||||
"\\\b" => "\b",
|
||||
"\\\f" => "\f",
|
||||
'\\\\' => '\\'
|
||||
);
|
||||
|
||||
// replace octal character codes
|
||||
$text = preg_replace_callback(
|
||||
'~\\\\([0-8]{3})~',
|
||||
create_function(
|
||||
'$matches',
|
||||
' if(octdec($matches[1]) > 32)
|
||||
return utf8_encode(chr(octdec($matches[1])));
|
||||
else
|
||||
return "";
|
||||
'
|
||||
),
|
||||
$return
|
||||
);
|
||||
|
||||
// execute conversion with $convert
|
||||
$text = strtr(($text), $convert);
|
||||
|
||||
if(false !== $this->dest)
|
||||
// store $text into the specified destination file
|
||||
// and return true on success or false on error
|
||||
return false !== file_put_contents($this->dest);
|
||||
else
|
||||
// return $text
|
||||
return $text;
|
||||
}
|
||||
}
|
||||
?>
|
||||
Loading…
Add table
Add a link
Reference in a new issue