PdfPig/src/UglyToad.Pdf/Parser/BaseParser.cs
2017-11-09 19:22:52 +00:00

942 lines
32 KiB
C#

namespace UglyToad.Pdf.Parser
{
using System;
using System.Globalization;
using System.IO;
using System.Text;
using Cos;
using IO;
using Util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This class is used to contain parsing logic that will be used by both the
* PDFParser and the COSStreamParser.
*
* @author Ben Litchfield
*/
public abstract class BaseParser
{
// Exclusive upper bound for object numbers: readObjectNumber rejects values at or above it
// (i.e. numbers with more than 10 digits).
private static readonly long OBJECT_NUMBER_THRESHOLD = 10000000000L;
// Largest generation number that readGenerationNumber accepts.
private static readonly long GENERATION_NUMBER_THRESHOLD = 65535;
// Number of characters in long.MaxValue; readstringNumber gives up once a numeric
// token grows longer than this.
static readonly int MAX_LENGTH_LONG = long.MaxValue.ToString().Length;
/**
* Character codes of single lower-case letters.
* Protected, so they are accessible to derived parsers.
*/
protected static readonly int E = 'e';
protected static readonly int N = 'n';
protected static readonly int D = 'd';
protected static readonly int S = 's';
protected static readonly int T = 't';
protected static readonly int R = 'r';
protected static readonly int A = 'a';
protected static readonly int M = 'm';
protected static readonly int O = 'o';
protected static readonly int B = 'b';
protected static readonly int J = 'j';
/**
* The keyword "def", kept as a constant for comparisons.
*/
public static readonly string DEF = "def";
/**
* The keyword "endobj", kept as a constant for comparisons.
*/
protected static readonly string ENDOBJ_string = "endobj";
/**
* The keyword "endstream", kept as a constant for comparisons.
*/
protected static readonly string ENDSTREAM_string = "endstream";
/**
* The keyword "stream", kept as a constant for comparisons.
*/
protected static readonly string STREAM_string = "stream";
/**
* The literal "true"; parsebool compares the text it reads against this value.
*/
private static readonly string TRUE = "true";
/**
* The literal "false"; parsebool compares the text it reads against this value.
*/
private static readonly string FALSE = "false";
/**
* The literal "null", kept as a constant for comparisons.
*/
private static readonly string NULL = "null";
/**
* ASCII code for line feed.
*/
protected static readonly byte ASCII_LF = 10;
/**
* ASCII code for carriage return.
*/
protected static readonly byte ASCII_CR = 13;
// ASCII codes for the digits '0' and '9' (used by isDigit) and for the space character.
private static readonly byte ASCII_ZERO = 48;
private static readonly byte ASCII_NINE = 57;
private static readonly byte ASCII_SPACE = 32;
/**
* This is the stream that will be read from. It is assigned once, in the constructor.
*/
protected readonly SequentialSource seqSource;
/**
* This is the document that will be parsed. It is never assigned by the members
* visible in this class.
*/
protected COSDocument document;
/**
* Creates a parser that performs all of its reads on the supplied source.
*
* @param pdfSource the sequential source this parser reads from.
*/
public BaseParser(SequentialSource pdfSource)
{
    seqSource = pdfSource;
}
/**
* Tells whether a character is an ASCII hexadecimal digit (0-9, a-f, A-F).
*
* Only the ASCII digits are accepted. char.IsDigit would also accept decimal
* digits of other scripts, which are not valid inside a PDF hex string.
*
* @param ch the character to test.
* @return true if the character is a hexadecimal digit.
*/
private static bool isHexDigit(char ch)
{
    return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
}
/**
* Consumes the end-of-line marker that is expected after the "stream" keyword.
*
* Any run of space characters is skipped first (some producers insert them
* before the line break). After that a CR LF pair, a single LF or a single CR
* is consumed. Any other byte is pushed back so that the caller sees it again.
* The end-of-stream marker (-1) is never pushed back, matching the other
* readers in this class.
*/
protected void skipWhiteSpaces()
{
    // PDF Ref 3.2.7: a stream must be followed by either a CRLF or LF but nothing else.
    int current = seqSource.read();

    // see brother_scan_cover.pdf, it adds whitespaces after the stream keyword
    // but before the start of the data, so just read those first
    while (current == ASCII_SPACE)
    {
        current = seqSource.read();
    }

    if (current == ASCII_CR)
    {
        current = seqSource.read();
        // A lone CR is invalid according to the spec, but it happens in the
        // real world, so the byte following it is simply handed back.
        if (current != ASCII_LF && current != -1)
        {
            seqSource.unread(current);
        }
    }
    else if (current != ASCII_LF && current != -1)
    {
        // No line break at all: be lenient and assume everything is fine.
        seqSource.unread(current);
    }
}
/**
* Works around literal strings with unbalanced parentheses.
*
* Some document creators write strings such as "/Title ( (5)" or
* "/Title (c:\)" where the number of opening and closing parentheses does
* not match. To recover, this method looks at the next three bytes without
* consuming them. If they show that a new object or the end of a dictionary
* follows on the next line, the string is treated as finished.
*
* @param bracesParameter the number of braces currently open.
*
* @return 0 if the look-ahead indicates the end of the string, otherwise the
* unchanged value of bracesParameter.
* @throws IOException if reading from the source fails.
*/
private int checkForEndOfstring(int bracesParameter)
{
    int braces = bracesParameter;
    byte[] lookahead = new byte[3];
    int amountRead = seqSource.read(lookahead);

    // Check the next 3 bytes if available.
    // The following cases are valid indicators for the end of the string:
    // 1. Next line contains another COSObject: CR + LF + '/'
    // 2. CosDictionary ends in the next line:  CR + LF + '>'
    // 3. Next line contains another COSObject: CR + '/'
    // 4. CosDictionary ends in the next line:  CR + '>'
    if (amountRead == 3 && lookahead[0] == ASCII_CR)
    {
        // The third byte only counts when the second byte is LF (cases 1 and 2).
        bool delimiterAfterCrLf = lookahead[1] == ASCII_LF
                                  && (lookahead[2] == '/' || lookahead[2] == '>');
        bool delimiterAfterCr = lookahead[1] == '/' || lookahead[1] == '>';

        if (delimiterAfterCrLf || delimiterAfterCr)
        {
            braces = 0;
        }
    }

    // This was only a look-ahead: hand every byte back to the source.
    if (amountRead > 0)
    {
        seqSource.unread(lookahead, 0, amountRead);
    }

    return braces;
}
/**
* This will parse a PDF string.
*
* A string starting with '&lt;' is handed to parseCOSHexstring. A string
* starting with '(' is read as a literal string: nested parentheses are
* tracked, backslash escape sequences are decoded and the raw bytes are
* collected. Every character is stored as exactly one byte, so values above
* 127 and octal escapes keep their byte value.
*
* @return The parsed PDF string.
*
* @throws IOException If there is an error reading from the stream or the
* input does not start with '(' or '&lt;'.
*/
protected CosString parseCOSstring()
{
    char nextChar = (char)seqSource.read();
    if (nextChar == '<')
    {
        return parseCOSHexstring();
    }

    if (nextChar != '(')
    {
        throw new IOException("parseCOSstring string should start with '(' or '<' and not '" +
                              nextChar + "' " + seqSource);
    }

    // The bytes are written straight to the memory stream. A text writer
    // would re-encode characters and would need a flush before ToArray().
    using (var memoryStream = new MemoryStream())
    {
        // This is the number of braces read
        int braces = 1;
        int c = seqSource.read();
        while (braces > 0 && c != -1)
        {
            char ch = (char)c;
            // -2 means "no look-ahead value available, read a fresh one"
            int nextc = -2;

            if (ch == ')')
            {
                braces--;
                braces = checkForEndOfstring(braces);
                if (braces != 0)
                {
                    memoryStream.WriteByte((byte)ch);
                }
            }
            else if (ch == '(')
            {
                braces++;
                memoryStream.WriteByte((byte)ch);
            }
            else if (ch == '\\')
            {
                int escaped = seqSource.read();
                if (escaped == -1)
                {
                    // The input ended right after the backslash: stop reading.
                    nextc = -1;
                }
                else
                {
                    char next = (char)escaped;
                    switch (next)
                    {
                        case 'n':
                            memoryStream.WriteByte((byte)'\n');
                            break;
                        case 'r':
                            memoryStream.WriteByte((byte)'\r');
                            break;
                        case 't':
                            memoryStream.WriteByte((byte)'\t');
                            break;
                        case 'b':
                            memoryStream.WriteByte((byte)'\b');
                            break;
                        case 'f':
                            memoryStream.WriteByte((byte)'\f');
                            break;
                        case ')':
                            // PDFBox 276 /Title (c:\)
                            braces = checkForEndOfstring(braces);
                            if (braces != 0)
                            {
                                memoryStream.WriteByte((byte)next);
                            }
                            else
                            {
                                memoryStream.WriteByte((byte)'\\');
                            }
                            break;
                        case '(':
                        case '\\':
                            memoryStream.WriteByte((byte)next);
                            break;
                        case '\n':
                        case '\r':
                            // this is a break in the line so ignore it and the newline and continue
                            c = seqSource.read();
                            while (isEOL(c))
                            {
                                c = seqSource.read();
                            }
                            nextc = c;
                            break;
                        case '0':
                        case '1':
                        case '2':
                        case '3':
                        case '4':
                        case '5':
                        case '6':
                        case '7':
                        {
                            // An octal escape consists of one to three octal digits.
                            var octal = new StringBuilder();
                            octal.Append(next);
                            c = seqSource.read();
                            char digit = (char)c;
                            if (digit >= '0' && digit <= '7')
                            {
                                octal.Append(digit);
                                c = seqSource.read();
                                digit = (char)c;
                                if (digit >= '0' && digit <= '7')
                                {
                                    octal.Append(digit);
                                }
                                else
                                {
                                    nextc = c;
                                }
                            }
                            else
                            {
                                nextc = c;
                            }

                            int character;
                            try
                            {
                                character = Convert.ToInt32(octal.ToString(), 8);
                            }
                            catch (FormatException e)
                            {
                                throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
                            }

                            // Three octal digits can exceed 255; only the low byte is kept.
                            memoryStream.WriteByte(unchecked((byte)character));
                            break;
                        }
                        default:
                            // dropping the backslash
                            // see 7.3.4.2 Literal strings for further information
                            memoryStream.WriteByte((byte)next);
                            break;
                    }
                }
            }
            else
            {
                memoryStream.WriteByte((byte)ch);
            }

            if (nextc != -2)
            {
                c = nextc;
            }
            else
            {
                c = seqSource.read();
            }
        }

        // The byte after the closing parenthesis belongs to the caller.
        if (c != -1)
        {
            seqSource.unread(c);
        }

        return new CosString(memoryStream.ToArray());
    }
}
/**
* Parses a PDF hex string with fail-fast semantics: as soon as a character
* that is not allowed is found, collecting stops and the input is skipped up
* to the closing bracket. This makes it possible to detect malformed input
* and continue with the next object.
*
* The opening '&lt;' must already have been read.
*
* @return The parsed PDF string.
*
* @throws IOException If the end of the stream is reached before the closing
* bracket, or if there is an error reading from the stream.
*/
private CosString parseCOSHexstring()
{
    var hex = new StringBuilder();
    while (true)
    {
        int current = seqSource.read();

        if (isHexDigit((char)current))
        {
            hex.Append((char)current);
            continue;
        }

        if (current == '>')
        {
            break;
        }

        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        bool ignorable = current == ' ' || current == '\n' || current == '\t'
                         || current == '\r' || current == '\b' || current == '\f';
        if (ignorable)
        {
            continue;
        }

        // An invalid character was found: drop the last hex digit when it
        // is not part of a complete pair.
        if (hex.Length % 2 != 0)
        {
            hex.Length--;
        }

        // Skip ahead to the closing bracket.
        do
        {
            current = seqSource.read();
        }
        while (current != '>' && current >= 0);

        // For malformed PDFs the end of the stream may be reached first.
        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        break;
    }

    return CosString.ParseHex(hex.ToString());
}
/**
* Tells whether a character code ends a PDF name.
*
* @param ch The character code to test.
* @return true for space, CR, LF, tab, form feed, NUL and the delimiters
* '(', ')', '&lt;', '&gt;', '[', ']' and '/'; false for everything else.
*/
protected bool isEndOfName(int ch)
{
    if (ch == ASCII_SPACE || ch == ASCII_CR || ch == ASCII_LF)
    {
        return true;
    }

    switch (ch)
    {
        case 0:    // NUL
        case 9:    // horizontal tab
        case '\f':
        case '(':
        case ')':
        case '<':
        case '>':
        case '[':
        case ']':
        case '/':
            return true;
        default:
            return false;
    }
}
/**
* Returns true if a byte sequence is valid UTF-8.
*
* A strict UTF-8 encoding is used, which throws on invalid or truncated
* sequences. The default decoder cannot be used for this check, because it
* replaces invalid bytes with U+FFFD and never reports an error.
*
* @param input the bytes to check; null is reported as not valid.
* @return true if the whole array decodes as UTF-8, otherwise false.
*/
private bool isValidUTF8(byte[] input)
{
    if (input == null)
    {
        return false;
    }

    try
    {
        // no byte order mark, throw on invalid bytes
        var strictUtf8 = new UTF8Encoding(false, true);
        strictUtf8.GetCharCount(input, 0, input.Length);
        return true;
    }
    catch (DecoderFallbackException)
    {
        return false;
    }
}
/**
* This will parse a bool object from the stream.
*
* The next character is peeked to decide which keyword to expect. Then
* exactly four ("true") or five ("false") bytes are read and compared with
* the keyword.
*
* @return The parsed bool object.
*
* @throws IOException If the text at the current position is neither "true"
* nor "false", or if an IO error occurs during parsing.
*/
protected CosBoolean parsebool()
{
    char first = (char)seqSource.peek();

    if (first == 't')
    {
        string actual = OtherEncodings.BytesAsLatin1String(seqSource.readFully(4));
        if (!actual.Equals(TRUE))
        {
            throw new IOException("Error parsing bool: expected='true' actual='" + actual
                                  + "' at offset " + seqSource.getPosition());
        }

        return CosBoolean.True;
    }

    if (first == 'f')
    {
        string actual = OtherEncodings.BytesAsLatin1String(seqSource.readFully(5));
        if (!actual.Equals(FALSE))
        {
            // The message names the keyword that was actually expected here.
            throw new IOException("Error parsing bool: expected='false' actual='" + actual
                                  + "' at offset " + seqSource.getPosition());
        }

        return CosBoolean.False;
    }

    throw new IOException("Error parsing bool expected='t or f' actual='" + first
                          + "' at offset " + seqSource.getPosition());
}
/**
* Reads the next token from the stream.
*
* Leading white space and comments are skipped. Characters are then collected
* until the end of the stream or a character for which isEndOfName is true.
* That terminating character is pushed back.
*
* @return The token that was read from the stream, never null.
*
* @throws IOException If there is an error reading from the stream.
*/
protected string readstring()
{
    SkipSpaces();

    var token = new StringBuilder();
    int current;
    for (current = seqSource.read();
         current != -1 && !isEndOfName((char)current);
         current = seqSource.read())
    {
        token.Append((char)current);
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }

    return token.ToString();
}
/**
* Reads the expected text from the stream and fails if something else is found.
* Delegates to the two-argument overload with skipSpaces set to false.
*
* @param expectedstring the text that must come next.
* @throws IOException if the text read differs from the expected value or if
* an I/O error occurs.
*/
protected void readExpectedstring(string expectedstring) => readExpectedstring(expectedstring, false);
/**
* Reads the given pattern from the source, character by character.
*
* White space and comments are skipped both before and after the pattern.
* NOTE(review): the skipSpaces argument is currently not consulted; skipping
* happens regardless of its value.
*
* @param expectedstring pattern to be read
* @param skipSpaces intended to control the skipping of spaces around the
* pattern (not evaluated at present)
* @throws IOException if pattern could not be read
*/
protected void readExpectedstring(string expectedstring, bool skipSpaces)
{
    SkipSpaces();

    for (int index = 0; index < expectedstring.Length; index++)
    {
        char expected = expectedstring[index];
        if (seqSource.read() == expected)
        {
            continue;
        }

        throw new IOException("Expected string '" + expectedstring
                              + "' but missed at character '" + expected + "' at offset "
                              + seqSource.getPosition());
    }

    SkipSpaces();
}
/**
* Reads a single character and verifies that it is the expected one.
*
* @param ec the char value that is expected.
* @throws IOException if the character read differs from the expected one or
* if an I/O error occurs.
*/
protected void readExpectedChar(char ec)
{
    char actual = (char)seqSource.read();
    if (actual == ec)
    {
        return;
    }

    throw new IOException("expected='" + ec + "' actual='" + actual + "' at offset " + seqSource.getPosition());
}
/**
* Reads the next token from the stream, limited to a maximum length.
*
* Leading white space and comments are skipped. Reading stops at the end of
* the stream, at white space, at ']' or at one of '[', '&lt;', '(' and '/',
* or once the given number of characters has been collected. The character
* that stopped the reading is pushed back.
*
* @param length The maximum number of characters to read.
*
* @return The token that was read from the stream, of length 0 to length.
*
* @throws IOException If there is an error reading from the stream.
*/
protected string readstring(int length)
{
    SkipSpaces();
    int current = seqSource.read();

    // Tokens are usually very short, so the builder is sized to the limit
    // instead of the larger default capacity.
    var token = new StringBuilder(length);

    while (true)
    {
        bool endOfToken = isWhitespace(current) || isClosing(current) || current == -1;
        bool startOfNextObject = current == '[' || current == '<' || current == '(' || current == '/';
        if (endOfToken || startOfNextObject || token.Length >= length)
        {
            break;
        }

        token.Append((char)current);
        current = seqSource.read();
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }

    return token.ToString();
}
/**
* Peeks at the next byte and tells whether it closes a PDF array.
*
* @return true if the next byte is ']', false otherwise.
*
* @throws IOException If an IO error occurs.
*/
protected bool isClosing() => isClosing(seqSource.peek());

/**
* Tells whether the given character closes a PDF array.
*
* @param c The character to check
* @return true if the character is ']', false otherwise.
*/
protected bool isClosing(int c) => c == ']';
/**
* Reads characters up to the first end-of-line marker.
* NOTE: The EOL marker may consist of 1 (CR or LF) or 2 (CR and LF) bytes,
* which is an important detail if one wants to unread the line.
*
* @return The characters between the current position and the end of the
* line, without the EOL marker.
*
* @throws IOException If the source is already at its end, or if there is an
* error reading from the stream.
*/
protected string readLine()
{
    if (seqSource.isEOF())
    {
        throw new IOException("Error: End-of-File, expected line");
    }

    var line = new StringBuilder(11);

    // CR and LF are valid EOLs
    int current = seqSource.read();
    while (current != -1 && !isEOL(current))
    {
        line.Append((char)current);
        current = seqSource.read();
    }

    // CR+LF is also a valid EOL: consume the LF that follows a CR
    if (isCR(current) && isLF(seqSource.peek()))
    {
        seqSource.read();
    }

    return line.ToString();
}
/**
* Peeks at the next byte and tells whether it is an end-of-line byte.
*
* @return true if the next byte is 0x0A or 0x0D.
*
* @throws IOException If there is an error reading from the stream.
*/
protected bool isEOL() => isEOL(seqSource.peek());

/**
* Tells whether the given character is an end-of-line byte.
*
* @param c The character to check against end of line
* @return true if the character is 0x0A or 0x0D.
*/
protected bool isEOL(int c) => isLF(c) || isCR(c);

// True if the given character is a line feed.
private bool isLF(int c) => c == ASCII_LF;

// True if the given character is a carriage return.
private bool isCR(int c) => c == ASCII_CR;
/**
* Peeks at the next byte and tells whether it is white space.
*
* @return true if the next byte in the stream is a whitespace character.
*
* @throws IOException If there is an error reading from the stream.
*/
protected bool isWhitespace() => isWhitespace(seqSource.peek());

/**
* Tells whether a character is white space. The values are those listed in
* table 1 (page 12) of ISO 32000-1:2008.
* @param c The character to check against whitespace
* @return true if the character is a whitespace character.
*/
protected bool isWhitespace(int c)
{
    switch (c)
    {
        case 0:   // NUL
        case 9:   // horizontal tab
        case 12:  // form feed
            return true;
        default:
            return c == ASCII_LF || c == ASCII_CR || c == ASCII_SPACE;
    }
}
/**
* Peeks at the next byte and tells whether it is a space.
*
* @return true if the next byte in the stream is a space character.
*
* @throws IOException If there is an error reading from the stream.
*/
protected bool isSpace() => isSpace(seqSource.peek());

/**
* Tells whether the given value is the space character.
*
* @param c The character to check against space
* @return true if the value is the space character.
*/
protected bool isSpace(int c) => c == ASCII_SPACE;
/**
* Peeks at the next byte and tells whether it is a digit.
*
* @return true if the next byte in the stream is a digit.
*
* @throws IOException If there is an error reading from the stream.
*/
protected bool isDigit() => isDigit(seqSource.peek());

/**
* Tells whether the given value is one of the ASCII digits '0' to '9'.
*
* @param c The character to be checked
* @return true if the value is a digit.
*/
protected static bool isDigit(int c)
{
    bool outsideDigitRange = c < ASCII_ZERO || c > ASCII_NINE;
    return !outsideDigitRange;
}
/**
* Skips all white space and comments at the current position.
*
* A comment starts with '%' and runs to the next end-of-line byte or the end
* of the stream. The first byte that is neither white space nor part of a
* comment is pushed back, unless the end of the stream was reached.
*
* @throws IOException If there is an error reading from the stream.
*/
protected void SkipSpaces()
{
    // 37 is the % character, which starts a comment
    const int commentStart = 37;

    int current = seqSource.read();
    while (true)
    {
        if (current == commentStart)
        {
            // skip past the comment section
            do
            {
                current = seqSource.read();
            }
            while (current != -1 && !isEOL(current));
        }
        else if (isWhitespace(current))
        {
            current = seqSource.read();
        }
        else
        {
            break;
        }
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }
}
/**
* Reads an object number from the stream.
*
* The value is read with readLong and must not be negative and must be
* smaller than OBJECT_NUMBER_THRESHOLD (i.e. have at most 10 digits).
*
* @return the object number being read.
* @throws IOException if the value is out of range or an I/O error occurs
*/
protected long readObjectNumber()
{
    long objectNumber = readLong();

    bool inRange = objectNumber >= 0 && objectNumber < OBJECT_NUMBER_THRESHOLD;
    if (!inRange)
    {
        throw new IOException("Object Number '" + objectNumber + "' has more than 10 digits or is negative");
    }

    return objectNumber;
}
/**
* Reads a generation number from the stream.
*
* The value is read with readInt and must lie between 0 and
* GENERATION_NUMBER_THRESHOLD (inclusive).
*
* @return the generation number being read.
* @throws IOException if the value is out of range or an I/O error occurs
*/
protected int readGenerationNumber()
{
    int generation = readInt();

    bool inRange = generation >= 0 && generation <= GENERATION_NUMBER_THRESHOLD;
    if (!inRange)
    {
        throw new IOException("Generation Number '" + generation + "' has more than 5 digits");
    }

    return generation;
}
/**
* This will read an integer from the stream.
*
* Leading white space and comments are skipped, then the numeric token is
* read with readstringNumber and parsed independently of the current
* culture. If the token is not a number or does not fit into an int, the
* token is pushed back onto the source before the exception is thrown.
*
* @return The integer that was read from the stream.
*
* @throws IOException If the token is not a valid int or if there is an
* error reading from the stream.
*/
protected int readInt()
{
    SkipSpaces();
    string token = readstringNumber().ToString();
    try
    {
        return int.Parse(token, NumberStyles.Integer, CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // int.Parse reports out-of-range values with OverflowException;
        // both failures mean "not an integer" to the callers.
        seqSource.unread(OtherEncodings.StringAsLatin1Bytes(token));
        throw new IOException("Error: Expected an integer type at offset " + seqSource.getPosition(), e);
    }
}
/**
* This will read a long from the stream.
*
* Leading white space and comments are skipped, then the numeric token is
* read with readstringNumber and parsed independently of the current
* culture. If the token is not a number or does not fit into a long, the
* token is pushed back onto the source before the exception is thrown.
*
* @return The long that was read from the stream.
*
* @throws IOException If the token is not a valid long or if there is an
* error reading from the stream.
*/
protected long readLong()
{
    SkipSpaces();
    StringBuilder longBuffer = readstringNumber();
    string token = longBuffer.ToString();
    try
    {
        return long.Parse(token, NumberStyles.Integer, CultureInfo.InvariantCulture);
    }
    catch (Exception e) when (e is FormatException || e is OverflowException)
    {
        // long.Parse reports out-of-range values with OverflowException;
        // both failures mean "not a long" to the callers.
        seqSource.unread(OtherEncodings.StringAsLatin1Bytes(token));
        throw new IOException(
            $"Error: Expected a long type at offset {seqSource.getPosition()}, instead got \'{longBuffer}\'", e);
    }
}
/**
* Reads the token that readInt and readLong parse as a number.
*
* Characters are collected until a space, LF, CR, '&lt;', '[', '(', NUL or
* the end of the stream is found. The terminating byte is pushed back,
* unless it is the end of the stream.
*
* @return the token to parse as integer or long by the calling method.
* @throws IOException if the token grows longer than MAX_LENGTH_LONG
* characters, or if thrown by the seqSource methods.
*/
protected StringBuilder readstringNumber()
{
    var token = new StringBuilder();
    int current;

    while (true)
    {
        current = seqSource.read();

        bool lineOrSpace = current == ASCII_SPACE || current == ASCII_LF || current == ASCII_CR;
        bool delimiter = current == 60      // '<', see sourceforge bug 1714707
                         || current == '['  // PDFBOX-1845
                         || current == '('; // PDFBOX-2579
        bool nulOrEnd = current == 0        // see sourceforge bug 853328
                        || current == -1;
        if (lineOrSpace || delimiter || nulOrEnd)
        {
            break;
        }

        token.Append((char)current);
        if (token.Length > MAX_LENGTH_LONG)
        {
            throw new IOException("Number '" + token +
                                  "' is getting too long, stop reading at offset " + seqSource.getPosition());
        }
    }

    if (current != -1)
    {
        seqSource.unread(current);
    }

    return token;
}
}
}