setSource($_src);
        $this->setDestination($_dest);
    }

    /**
     * Sets the raw PDF data directly, for use when no file has to be read.
     *
     * @param string|false $_data raw PDF data; false means "no data"
     *                            (convert() treats false as nothing to convert)
     */
    function setInput($_data)
    {
        $this->data = $_data;
    }

    /**
     * Sets the source file that convert() reads the PDF data from.
     *
     * @param string|false $_src path of the source file; convert() skips
     *                           reading a file when this is false
     */
    function setSource($_src)
    {
        $this->src = $_src;
    }

    /**
     * Sets the destination file that convert() writes the extracted text to.
     *
     * @param string|false $_dest path of the destination file; when false,
     *                            convert() returns the text instead of writing it
     */
    function setDestination($_dest)
    {
        $this->dest = $_dest;
    }

    /**
     * Converts the given PDF data with a fresh converter instance.
     *
     * @param string $_data raw PDF data
     *
     * @return string|bool whatever convert() returns for that data
     */
    function directConvert($_data)
    {
        $pdf2txt = new pdf2txt();

        return $pdf2txt->convert($_data);
    }

    /**
     * Extracts the text of a PDF document (PDF -> plain text).
     *
     * Processing steps:
     *  1. take the data from $_data, or read it from the source file if one is set
     *     (a configured source file overrides any data passed in or set before);
     *  2. detect the encoding from the first "/Encoding /Name" entry
     *     (a single charset is assumed for the whole document);
     *  3. cut the document into streams, try to inflate each one and convert
     *     it to UTF-8;
     *  4. cut every stream into text objects (BT ... ET);
     *  5. collect the literal strings "( ... )" of every text object, emitting
     *     a line break for the Td, TD, T*, " and ' operators;
     *  6. resolve octal escapes, bracket placeholders and escape sequences.
     *
     * @param string|false $_data raw PDF data, or false to use the data / source
     *                            file configured on the object
     *
     * @return string|bool false if the source file cannot be read or there is
     *                     nothing to convert; if a destination file is set,
     *                     true/false for the success of the write; otherwise
     *                     the extracted text
     */
    function convert($_data = false)
    {
        if (false !== $_data) {
            $this->data = $_data;
        }

        if (false !== $this->src && false === ($this->data = file_get_contents($this->src))) {
            // [ ERROR ]
            // source file could not be read
            return false;
        }

        if ($this->data === false) {
            // [ ERROR ]
            // nothing to convert
            return false;
        }

        // ###############################
        // data available -> start parsing
        // ###############################

        // parse encoding
        preg_match('~/Encoding\\s*/(\\w+)~ism', $this->data, $encoding);

        // detect encoding and assume that there is only a single charset for the whole document;
        // documents without an /Encoding entry fall back to the default below
        $encodingName = isset($encoding[1]) ? $encoding[1] : '';
        $fromEncoding = 'windows-1252'; // also used for 'WinAnsiEncoding' (standard encoding)
        if ($encodingName === 'MacRomanEncoding') {
            $fromEncoding = 'macintosh';
        }

        // parse data
        // the following code ignores the keywords "stream" and "endstream" if they are in a string
        $splitFlags = PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY;
        $isStream = false;
        $stream = '';
        $streams = array();
        $openBracketCount = 0;
        $encodedStream = false;
        $documentParts = preg_split(
            '~(<<\\s*/.*?>>\\s*stream\\s*)|(\\s*endstream\\s*)|(\\()|(\\))~ism',
            $this->data,
            -1,
            $splitFlags
        );

        foreach ($documentParts as $part) {
            if (preg_match('~<<\\s*/(.*?)>>\\s*stream\\s*~ism', $part, $match)) {
                $switch = 'stream';
                if (false !== strpos($match[1], '/Filter')) {
                    // filtered (e.g. compressed) streams are binary: brackets inside must not be counted
                    $encodedStream = true;
                }
            } else {
                $switch = trim($part);
            }

            switch ($switch) {
                case '(':
                    if ($isStream && !$encodedStream) {
                        $openBracketCount++;
                    }
                    break;
                case ')':
                    if ($isStream && !$encodedStream) {
                        $openBracketCount--;
                    }
                    break;
                case 'endstream':
                    // only a keyword outside of a string closes the stream
                    if ($isStream && $openBracketCount <= 0) {
                        $isStream = false;
                        $streams[] = $stream;
                        $stream = '';
                        $encodedStream = false;
                    }
                    break;
            }

            if ($isStream) {
                $stream .= $part;
            }

            if ($switch === 'stream') {
                if ($isStream) {
                    // NOTE(review): a stream header met inside an open stream was already
                    // appended just above, so it ends up in the stream twice - kept as is
                    $stream .= $part;
                } else {
                    $isStream = true;
                }
            }
        }

        $textObjects = array();
        foreach ($streams as $rawStream) {
            // uncompress the stream
            if (false === ($uncompressed = @gzuncompress($rawStream))) {
                // if nothing to uncompress, assume that the stream is already uncompressed
                $uncompressed = $rawStream;
            }

            // convert to internal encoding UTF-8
            $uncompressed = @iconv($fromEncoding, 'UTF-8', $uncompressed);

            // replace escaped brackets with placeholders
            $text = str_replace(
                array('\\(', '\\)', '\\[', '\\]'),
                array('##STARTBRACKET##', '##ENDBRACKET##', '##STARTSBRACKET##', '##ENDSBRACKET##'),
                $uncompressed
            );

            // parse streams
            // the following code ignores the keywords "BT" and "ET" if they are in a string
            $isTextObj = false;
            $textObject = '';
            $openBracketCount = 0;
            $streamParts = preg_split('~(\\s*BT\\s+)|(\\s+ET\\s+)|(\\()|(\\))~ism', $text, -1, $splitFlags);

            foreach ($streamParts as $part) {
                $switch = trim($part);

                switch ($switch) {
                    case '(':
                        if ($isTextObj) {
                            $openBracketCount++;
                        }
                        break;
                    case ')':
                        if ($isTextObj) {
                            $openBracketCount--;
                        }
                        break;
                    case 'ET':
                        // only a keyword outside of a string closes the text object
                        if ($isTextObj && $openBracketCount <= 0) {
                            $isTextObj = false;
                            $textObjects[] = $textObject;
                            $textObject = '';
                        }
                        break;
                }

                if ($isTextObj) {
                    $textObject .= $part;
                }

                if ($switch === 'BT') {
                    if ($isTextObj) {
                        // NOTE(review): same double append as for nested stream headers - kept as is
                        $textObject .= $part;
                    } else {
                        $isTextObj = true;
                    }
                }
            }
        }

        $return = '';
        // buffer of the string that is currently open; initialised up front so that it is
        // never read or appended to while undefined
        $string = '';
        foreach ($textObjects as $textObject) {
            // parse text-objects
            // the following code ignores PDF-keywords if they are in a string
            $isString = false;
            $openBracketCount = 0;
            $objectParts = preg_split(
                '~(?:\\s+(Td|TD|T\\*|"|\')\\s+)|(\\()|(\\))~ism',
                $textObject,
                -1,
                $splitFlags
            );

            foreach ($objectParts as $part) {
                switch ($part) {
                    // new line
                    case 'Td':
                    case 'TD':
                    case 'T*':
                    case '"':
                    case "'":
                        if (!$isString) {
                            $return .= "\n";
                        }
                        break;
                    case ')':
                        if ($isString && $openBracketCount <= 0) {
                            // closing bracket of the outermost string: emit the collected text
                            $isString = false;
                            $return .= $string;
                            $string = '';
                        } elseif ($isString) {
                            // closing bracket of a nested, unescaped bracket pair
                            $openBracketCount--;
                        }
                        break;
                }

                if ($isString) {
                    $string .= $part;
                }

                if ($part === '(') {
                    if ($isString) {
                        $openBracketCount++;
                    } else {
                        $isString = true;
                    }
                }
            }

            $return .= "\n";
        }

        // substitute the placeholders for the brackets and escape sequences
        $convert = array('##STARTBRACKET##' => '(', '##ENDBRACKET##' => ')', '##STARTSBRACKET##' => '[', '##ENDSBRACKET##' => ']', "\\\n" => "\n", "\\\r" => "\n", "\\\n\r" => "\n", "\\\t" => "\t", "\\\\b" => "\\b", "\\\f" => "\f", '\\\\' => '\\');

        // replace octal character codes (only 0-7 are octal digits);
        // a closure is used because create_function() no longer exists in PHP 8
        $text = preg_replace_callback(
            '~\\\\([0-7]{3})~',
            function ($matches) {
                $code = octdec($matches[1]);
                if ($code > 32) {
                    // the byte is interpreted as ISO-8859-1, exactly like utf8_encode() did;
                    // the mask mirrors chr()'s modulo-256 handling of values above 255
                    return iconv('ISO-8859-1', 'UTF-8', chr($code & 0xFF));
                }

                // control characters and space are dropped
                return '';
            },
            $return
        );

        // execute conversion with $convert
        $text = strtr($text, $convert);

        if (false !== $this->dest) {
            // store $text into the specified destination file
            // and return true on success or false on error
            return false !== file_put_contents($this->dest, $text);
        }

        // return $text
        return $text;
    }
}