namespace UglyToad.Pdf.Parser
{
    using System;
    using System.Globalization;
    using System.IO;
    using System.Text;
    using Cos;
    using IO;
    using Util;

    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *      http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Contains parsing logic that is shared by the PDF parser and the COS stream parser.
    /// All reads go through <see cref="seqSource"/>; a return value of -1 from its
    /// <c>read()</c>/<c>peek()</c> methods is treated as end of stream throughout.
    /// </summary>
    /// <remarks>Original author: Ben Litchfield.</remarks>
    public abstract class BaseParser
    {
        /// <summary>Exclusive upper bound for object numbers (anything with more than 10 digits is rejected).</summary>
        private static readonly long OBJECT_NUMBER_THRESHOLD = 10000000000L;

        /// <summary>Inclusive upper bound for generation numbers.</summary>
        private static readonly long GENERATION_NUMBER_THRESHOLD = 65535;

        /// <summary>Number of characters in the decimal representation of <see cref="long.MaxValue"/>.</summary>
        private static readonly int MAX_LENGTH_LONG = long.MaxValue.ToString(CultureInfo.InvariantCulture).Length;

        // Character codes of individual lowercase letters (the letters that make up
        // "endstream" and "endobj"), exposed to subclasses.
        protected static readonly int E = 'e';
        protected static readonly int N = 'n';
        protected static readonly int D = 'd';
        protected static readonly int S = 's';
        protected static readonly int T = 't';
        protected static readonly int R = 'r';
        protected static readonly int A = 'a';
        protected static readonly int M = 'm';
        protected static readonly int O = 'o';
        protected static readonly int B = 'b';
        protected static readonly int J = 'j';

        /// <summary>String constant used for comparisons.</summary>
        public static readonly string DEF = "def";

        /// <summary>String constant used for comparisons.</summary>
        protected static readonly string ENDOBJ_string = "endobj";

        /// <summary>String constant used for comparisons.</summary>
        protected static readonly string ENDSTREAM_string = "endstream";

        /// <summary>String constant used for comparisons.</summary>
        protected static readonly string STREAM_string = "stream";

        /// <summary>String constant used for comparisons.</summary>
        private static readonly string TRUE = "true";

        /// <summary>String constant used for comparisons.</summary>
        private static readonly string FALSE = "false";

        /// <summary>String constant used for comparisons.</summary>
        private static readonly string NULL = "null";

        /// <summary>ASCII code for line feed.</summary>
        protected static readonly byte ASCII_LF = 10;

        /// <summary>ASCII code for carriage return.</summary>
        protected static readonly byte ASCII_CR = 13;

        private static readonly byte ASCII_ZERO = 48;
        private static readonly byte ASCII_NINE = 57;
        private static readonly byte ASCII_SPACE = 32;

        /// <summary>The stream that will be read from.</summary>
        protected readonly SequentialSource seqSource;

        /// <summary>The document that will be parsed.</summary>
        protected COSDocument document;

        /// <summary>
        /// Creates a parser that reads from the given source.
        /// </summary>
        /// <param name="pdfSource">The source every read/unread operation of this parser is performed on.</param>
        public BaseParser(SequentialSource pdfSource)
        {
            this.seqSource = pdfSource;
        }

        /// <summary>
        /// Tells whether a value read from the source is an ASCII hexadecimal digit.
        /// </summary>
        /// <param name="ch">The value to test; the end-of-stream marker (-1) is never a hex digit.</param>
        private static bool isHexDigit(int ch)
        {
            return (ch >= '0' && ch <= '9')
                || (ch >= 'a' && ch <= 'f')
                || (ch >= 'A' && ch <= 'F');
        }

        /// <summary>
        /// Pushes a value back onto the source unless it is the end-of-stream marker.
        /// </summary>
        private void unreadIfNotEof(int value)
        {
            if (value != -1)
            {
                seqSource.unread(value);
            }
        }

        /// <summary>
        /// Skips any run of spaces followed by a single end-of-line marker (CR, LF or CR+LF).
        /// </summary>
        /// <remarks>
        /// PDF Ref 3.2.7: a stream must be followed by either a CRLF or LF but nothing else.
        /// Parsing is lenient: a lone CR, or no end of line at all, is tolerated and the
        /// offending byte is pushed back. The end-of-stream marker is not pushed back,
        /// in line with the other readers of this class.
        /// </remarks>
        protected void skipWhiteSpaces()
        {
            int whitespace = seqSource.read();

            // see brother_scan_cover.pdf, it adds whitespaces after the stream but before
            // the start of the data, so just read those first
            while (ASCII_SPACE == whitespace)
            {
                whitespace = seqSource.read();
            }

            if (ASCII_CR == whitespace)
            {
                whitespace = seqSource.read();
                if (ASCII_LF != whitespace)
                {
                    // The spec says a lone CR is invalid but it happens in the real
                    // world so we must support it.
                    unreadIfNotEof(whitespace);
                }
            }
            else if (ASCII_LF != whitespace)
            {
                // we are in an error, but again we will do a lenient parsing and just
                // assume that everything is fine
                unreadIfNotEof(whitespace);
            }
        }

        /// <summary>
        /// Looks at the next three bytes to decide whether a literal string with unbalanced
        /// parentheses has in fact ended; the bytes are pushed back afterwards.
        /// </summary>
        /// <remarks>
        /// This works around broken document creators, e.g. <c>/Title ( (5) /Creator</c> or
        /// <c>/Title (c:\) /Producer</c>. The string is considered finished when it is followed by:
        /// 1. CR + LF + '/', 2. CR + LF + '&gt;', 3. CR + '/', 4. CR + '&gt;'.
        /// </remarks>
        /// <param name="bracesParameter">The number of braces currently open.</param>
        /// <returns>0 when one of the end indicators is found, otherwise <paramref name="bracesParameter"/>.</returns>
        private int checkForEndOfstring(int bracesParameter)
        {
            int braces = bracesParameter;
            byte[] nextThreeBytes = new byte[3];
            int amountRead = seqSource.read(nextThreeBytes);

            if (amountRead == 3 && nextThreeBytes[0] == ASCII_CR)
            {
                // cases 1 and 2: the delimiter in byte 2 only counts when byte 1 is the LF
                bool crLfThenDelimiter = nextThreeBytes[1] == ASCII_LF
                    && (nextThreeBytes[2] == '/' || nextThreeBytes[2] == '>');

                // cases 3 and 4
                bool crThenDelimiter = nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>';

                if (crLfThenDelimiter || crThenDelimiter)
                {
                    braces = 0;
                }
            }

            if (amountRead > 0)
            {
                seqSource.unread(nextThreeBytes, 0, amountRead);
            }

            return braces;
        }

        /// <summary>
        /// Parses a PDF string, either a literal string <c>( ... )</c> or a hex string <c>&lt; ... &gt;</c>.
        /// </summary>
        /// <remarks>
        /// The bytes of the string are collected unchanged (one output byte per input byte or
        /// escape sequence); no text encoding is applied.
        /// </remarks>
        /// <returns>The parsed PDF string.</returns>
        /// <exception cref="IOException">If the next character is neither '(' nor '&lt;'.</exception>
        protected CosString parseCOSstring()
        {
            char nextChar = (char)seqSource.read();
            if (nextChar == '<')
            {
                return parseCOSHexstring();
            }

            if (nextChar != '(')
            {
                throw new IOException("parseCOSstring string should start with '(' or '<' and not '" + nextChar + "' " + seqSource);
            }

            using (var output = new MemoryStream())
            {
                // This is the number of braces read
                int braces = 1;
                int c = seqSource.read();
                while (braces > 0 && c != -1)
                {
                    // -2 means "no look-ahead value pending"
                    int nextc = -2;

                    if (c == ')')
                    {
                        braces--;
                        braces = checkForEndOfstring(braces);
                        if (braces != 0)
                        {
                            output.WriteByte((byte)c);
                        }
                    }
                    else if (c == '(')
                    {
                        braces++;
                        output.WriteByte((byte)c);
                    }
                    else if (c == '\\')
                    {
                        nextc = readEscapeSequence(output, ref braces);
                    }
                    else
                    {
                        output.WriteByte((byte)c);
                    }

                    c = nextc != -2 ? nextc : seqSource.read();
                }

                if (c != -1)
                {
                    seqSource.unread(c);
                }

                return new CosString(output.ToArray());
            }
        }

        /// <summary>
        /// Handles the characters following a backslash inside a literal string and writes the
        /// resulting byte (if any) to <paramref name="output"/>.
        /// </summary>
        /// <param name="output">Receives the decoded byte.</param>
        /// <param name="braces">The open-brace counter; may be reset to 0 for an escaped ')'.</param>
        /// <returns>
        /// A value that was read ahead and must be processed next by the caller (possibly -1 for
        /// end of stream), or -2 when nothing was read ahead.
        /// </returns>
        private int readEscapeSequence(MemoryStream output, ref int braces)
        {
            const int nothingPending = -2;

            int escaped = seqSource.read();
            switch (escaped)
            {
                case -1:
                    // dangling backslash at the end of the stream: nothing to emit
                    return -1;
                case 'n':
                    output.WriteByte((byte)'\n');
                    return nothingPending;
                case 'r':
                    output.WriteByte((byte)'\r');
                    return nothingPending;
                case 't':
                    output.WriteByte((byte)'\t');
                    return nothingPending;
                case 'b':
                    output.WriteByte((byte)'\b');
                    return nothingPending;
                case 'f':
                    output.WriteByte((byte)'\f');
                    return nothingPending;
                case ')':
                    // PDFBox 276 /Title (c:\)
                    braces = checkForEndOfstring(braces);
                    output.WriteByte(braces != 0 ? (byte)')' : (byte)'\\');
                    return nothingPending;
                case '(':
                case '\\':
                    output.WriteByte((byte)escaped);
                    return nothingPending;
                case '\n':
                case '\r':
                {
                    // this is a break in the line so ignore it and the newline and continue
                    int following = seqSource.read();
                    while (isEOL(following) && following != -1)
                    {
                        following = seqSource.read();
                    }

                    return following;
                }
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                {
                    // up to three octal digits; a non-octal value ends the sequence
                    // and is handed back to the caller
                    int value = escaped - '0';
                    int pending = nothingPending;

                    int digit = seqSource.read();
                    if (digit >= '0' && digit <= '7')
                    {
                        value = (value * 8) + (digit - '0');
                        digit = seqSource.read();
                        if (digit >= '0' && digit <= '7')
                        {
                            value = (value * 8) + (digit - '0');
                        }
                        else
                        {
                            pending = digit;
                        }
                    }
                    else
                    {
                        pending = digit;
                    }

                    // only the low 8 bits are kept for values above 0377
                    output.WriteByte((byte)(value & 0xFF));
                    return pending;
                }
                default:
                    // dropping the backslash
                    // see 7.3.4.2 Literal strings for further information
                    output.WriteByte((byte)escaped);
                    return nothingPending;
            }
        }

        /// <summary>
        /// Parses a PDF hex string with fail-fast semantics: parsing stops at the first
        /// character that is not allowed, so malformed input can be detected and the parser
        /// can skip to the next object start. The leading '&lt;' must already have been read.
        /// </summary>
        /// <returns>The parsed PDF string.</returns>
        /// <exception cref="IOException">If the end of the stream is reached before the closing '&gt;'.</exception>
        private CosString parseCOSHexstring()
        {
            var sBuf = new StringBuilder();
            while (true)
            {
                int c = seqSource.read();
                if (c < 0)
                {
                    throw new IOException("Missing closing bracket for hex string. Reached EOS.");
                }

                if (isHexDigit(c))
                {
                    sBuf.Append((char)c);
                    continue;
                }

                if (c == '>')
                {
                    break;
                }

                if (c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\b' || c == '\f')
                {
                    continue;
                }

                // an invalid char was found: discard the last
                // hex character if it is not part of a pair
                if (sBuf.Length % 2 != 0)
                {
                    sBuf.Remove(sBuf.Length - 1, 1);
                }

                // read till the closing bracket was found
                do
                {
                    c = seqSource.read();
                }
                while (c != '>' && c >= 0);

                // might have reached EOF while looking for the closing bracket
                // this can happen for malformed PDFs only. Make sure that there is
                // no endless loop.
                if (c < 0)
                {
                    throw new IOException("Missing closing bracket for hex string. Reached EOS.");
                }

                break;
            }

            return CosString.ParseHex(sBuf.ToString());
        }

        /// <summary>
        /// Determines if a character terminates a PDF name.
        /// </summary>
        /// <param name="ch">The character.</param>
        /// <returns>true if the character terminates a PDF name, otherwise false.</returns>
        protected bool isEndOfName(int ch)
        {
            return ch == ASCII_SPACE || ch == ASCII_CR || ch == ASCII_LF || ch == 9 || ch == '>' ||
                   ch == '<' || ch == '[' || ch == '/' || ch == ']' || ch == ')' || ch == '(' ||
                   ch == 0 || ch == '\f';
        }

        /// <summary>
        /// Returns true if a byte sequence is valid UTF-8.
        /// </summary>
        /// <param name="input">The bytes to check; null is reported as invalid.</param>
        private bool isValidUTF8(byte[] input)
        {
            if (input == null)
            {
                return false;
            }

            try
            {
                // throwOnInvalidBytes: true - the default decoder silently substitutes
                // U+FFFD and therefore can never report invalid input
                new UTF8Encoding(false, true).GetCharCount(input);
                return true;
            }
            catch (DecoderFallbackException)
            {
                return false;
            }
        }

        /// <summary>
        /// Parses a boolean object ("true" or "false") from the stream.
        /// </summary>
        /// <returns>The parsed boolean object.</returns>
        /// <exception cref="IOException">If the stream does not continue with "true" or "false".</exception>
        protected CosBoolean parsebool()
        {
            char c = (char)seqSource.peek();
            if (c == 't')
            {
                string truestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(4));
                if (!string.Equals(truestring, TRUE, StringComparison.Ordinal))
                {
                    throw new IOException("Error parsing bool: expected='true' actual='" + truestring
                                          + "' at offset " + seqSource.getPosition());
                }

                return CosBoolean.True;
            }

            if (c == 'f')
            {
                string falsestring = OtherEncodings.BytesAsLatin1String(seqSource.readFully(5));
                if (!string.Equals(falsestring, FALSE, StringComparison.Ordinal))
                {
                    throw new IOException("Error parsing bool: expected='false' actual='" + falsestring
                                          + "' at offset " + seqSource.getPosition());
                }

                return CosBoolean.False;
            }

            throw new IOException("Error parsing bool expected='t or f' actual='" + c
                                  + "' at offset " + seqSource.getPosition());
        }

        /// <summary>
        /// Reads the next string from the stream, after skipping leading whitespace and comments.
        /// Reading stops at end of stream or at the first character for which
        /// <see cref="isEndOfName"/> is true; that character is pushed back.
        /// </summary>
        /// <returns>The string that was read from the stream, never null.</returns>
        protected string readstring()
        {
            SkipSpaces();
            StringBuilder buffer = new StringBuilder();
            int c = seqSource.read();
            while (c != -1 && !isEndOfName(c))
            {
                buffer.Append((char)c);
                c = seqSource.read();
            }

            if (c != -1)
            {
                seqSource.unread(c);
            }

            return buffer.ToString();
        }

        /// <summary>
        /// Reads one string and throws an exception if it is not the expected value.
        /// </summary>
        /// <param name="expectedstring">The string value that is expected.</param>
        /// <exception cref="IOException">If the string read is not the expected value.</exception>
        protected void readExpectedstring(string expectedstring)
        {
            readExpectedstring(expectedstring, false);
        }

        /// <summary>
        /// Reads the given pattern from <see cref="seqSource"/>.
        /// </summary>
        /// <param name="expectedstring">Pattern to be skipped.</param>
        /// <param name="skipSpaces">
        /// Intended to control whether spaces before and after the pattern are skipped.
        /// NOTE: the flag is currently not consulted - whitespace and comments are always
        /// skipped before and after the pattern.
        /// </param>
        /// <exception cref="IOException">If the pattern could not be read.</exception>
        protected void readExpectedstring(string expectedstring, bool skipSpaces)
        {
            SkipSpaces();
            foreach (var c in expectedstring)
            {
                if (seqSource.read() != c)
                {
                    throw new IOException("Expected string '" + expectedstring
                                          + "' but missed at character '" + c + "' at offset "
                                          + seqSource.getPosition());
                }
            }

            SkipSpaces();
        }

        /// <summary>
        /// Reads one char and throws an exception if it is not the expected value.
        /// </summary>
        /// <param name="ec">The char value that is expected.</param>
        /// <exception cref="IOException">If the read char is not the expected value.</exception>
        protected void readExpectedChar(char ec)
        {
            char c = (char)seqSource.read();
            if (c != ec)
            {
                throw new IOException("expected='" + ec + "' actual='" + c + "' at offset " + seqSource.getPosition());
            }
        }

        /// <summary>
        /// Reads the next string from the stream up to a certain length, after skipping leading
        /// whitespace and comments. Reading also stops at whitespace, end of stream, ']', '[',
        /// '&lt;', '(' or '/'; the terminating value is pushed back.
        /// </summary>
        /// <param name="length">The length to stop reading at.</param>
        /// <returns>The string that was read from the stream, of length 0 to <paramref name="length"/>.</returns>
        protected string readstring(int length)
        {
            SkipSpaces();

            int c = seqSource.read();

            // average string size is around 2 and the normal string buffer size is
            // about 16 so lets save some space.
            StringBuilder buffer = new StringBuilder(length);
            while (!isWhitespace(c) && !isClosing(c) && c != -1 && buffer.Length < length &&
                   c != '[' &&
                   c != '<' &&
                   c != '(' &&
                   c != '/')
            {
                buffer.Append((char)c);
                c = seqSource.read();
            }

            if (c != -1)
            {
                seqSource.unread(c);
            }

            return buffer.ToString();
        }

        /// <summary>
        /// Tells if the next character is a closing bracket (close of a PDF array).
        /// </summary>
        /// <returns>true if the next byte is ']', false otherwise.</returns>
        protected bool isClosing()
        {
            return isClosing(seqSource.peek());
        }

        /// <summary>
        /// Tells if the given character is a closing bracket (close of a PDF array).
        /// </summary>
        /// <param name="c">The character to check.</param>
        /// <returns>true if the value is ']', false otherwise.</returns>
        protected bool isClosing(int c)
        {
            return c == ']';
        }

        /// <summary>
        /// Reads bytes until the first end-of-line marker occurs and consumes that marker.
        /// </summary>
        /// <remarks>
        /// The EOL marker may consist of 1 (CR or LF) or 2 (CR and LF) bytes,
        /// which is an important detail if one wants to unread the line.
        /// </remarks>
        /// <returns>The characters between the current position and the end of the line.</returns>
        /// <exception cref="IOException">If the source is already at end of file.</exception>
        protected string readLine()
        {
            if (seqSource.isEOF())
            {
                throw new IOException("Error: End-of-File, expected line");
            }

            StringBuilder buffer = new StringBuilder(11);

            int c;
            while ((c = seqSource.read()) != -1)
            {
                // CR and LF are valid EOLs
                if (isEOL(c))
                {
                    break;
                }

                buffer.Append((char)c);
            }

            // CR+LF is also a valid EOL
            if (isCR(c) && isLF(seqSource.peek()))
            {
                seqSource.read();
            }

            return buffer.ToString();
        }

        /// <summary>
        /// Tells if the next byte to be read is an end-of-line byte.
        /// </summary>
        /// <returns>true if the next byte is 0x0A or 0x0D.</returns>
        protected bool isEOL()
        {
            return isEOL(seqSource.peek());
        }

        /// <summary>
        /// Tells if the given value is an end-of-line byte.
        /// </summary>
        /// <param name="c">The character to check against end of line.</param>
        /// <returns>true if the value is 0x0A or 0x0D.</returns>
        protected bool isEOL(int c)
        {
            return isLF(c) || isCR(c);
        }

        private bool isLF(int c)
        {
            return ASCII_LF == c;
        }

        private bool isCR(int c)
        {
            return ASCII_CR == c;
        }

        /// <summary>
        /// Tells if the next byte is whitespace or not.
        /// </summary>
        /// <returns>true if the next byte in the stream is a whitespace character.</returns>
        protected bool isWhitespace()
        {
            return isWhitespace(seqSource.peek());
        }

        /// <summary>
        /// Tells if a character is whitespace or not. These values are
        /// specified in table 1 (page 12) of ISO 32000-1:2008.
        /// </summary>
        /// <param name="c">The character to check against whitespace.</param>
        /// <returns>true if the character is a whitespace character.</returns>
        protected bool isWhitespace(int c)
        {
            return c == 0 || c == 9 || c == 12 || c == ASCII_LF
                   || c == ASCII_CR || c == ASCII_SPACE;
        }

        /// <summary>
        /// Tells if the next byte is a space or not.
        /// </summary>
        /// <returns>true if the next byte in the stream is a space character.</returns>
        protected bool isSpace()
        {
            return isSpace(seqSource.peek());
        }

        /// <summary>
        /// Tells if the given value is a space or not.
        /// </summary>
        /// <param name="c">The character to check against space.</param>
        /// <returns>true if the value is a space character.</returns>
        protected bool isSpace(int c)
        {
            return ASCII_SPACE == c;
        }

        /// <summary>
        /// Tells if the next byte is a digit or not.
        /// </summary>
        /// <returns>true if the next byte in the stream is a digit.</returns>
        protected bool isDigit()
        {
            return isDigit(seqSource.peek());
        }

        /// <summary>
        /// Tells if the given value is an ASCII digit or not.
        /// </summary>
        /// <param name="c">The character to be checked.</param>
        /// <returns>true if the value is between '0' and '9'.</returns>
        protected static bool isDigit(int c)
        {
            return c >= ASCII_ZERO && c <= ASCII_NINE;
        }

        /// <summary>
        /// Skips all whitespace and comments that are present. A comment starts with '%'
        /// and runs up to the next end-of-line byte.
        /// </summary>
        protected void SkipSpaces()
        {
            int c = seqSource.read();

            // 37 is the % character, a comment
            while (isWhitespace(c) || c == 37)
            {
                if (c == 37)
                {
                    // skip past the comment section
                    c = seqSource.read();
                    while (!isEOL(c) && c != -1)
                    {
                        c = seqSource.read();
                    }
                }
                else
                {
                    c = seqSource.read();
                }
            }

            if (c != -1)
            {
                seqSource.unread(c);
            }
        }

        /// <summary>
        /// Reads an object number from the stream.
        /// </summary>
        /// <returns>The object number being read.</returns>
        /// <exception cref="IOException">
        /// If the value is negative, has more than 10 digits (i.e. is not below
        /// <see cref="OBJECT_NUMBER_THRESHOLD"/>) or cannot be parsed.
        /// </exception>
        protected long readObjectNumber()
        {
            long retval = readLong();
            if (retval < 0 || retval >= OBJECT_NUMBER_THRESHOLD)
            {
                throw new IOException("Object Number '" + retval + "' has more than 10 digits or is negative");
            }

            return retval;
        }

        /// <summary>
        /// Reads a generation number from the stream.
        /// </summary>
        /// <returns>The generation number being read.</returns>
        /// <exception cref="IOException">
        /// If the value is negative, bigger than <see cref="GENERATION_NUMBER_THRESHOLD"/>
        /// or cannot be parsed.
        /// </exception>
        protected int readGenerationNumber()
        {
            int retval = readInt();
            if (retval < 0 || retval > GENERATION_NUMBER_THRESHOLD)
            {
                throw new IOException("Generation Number '" + retval + "' has more than 5 digits");
            }

            return retval;
        }

        /// <summary>
        /// Reads an integer from the stream, after skipping leading whitespace and comments.
        /// </summary>
        /// <returns>The integer that was read from the stream.</returns>
        /// <exception cref="IOException">
        /// If the token is not an integer or does not fit into an <see cref="int"/>;
        /// the token is pushed back onto the source first.
        /// </exception>
        protected int readInt()
        {
            SkipSpaces();

            StringBuilder intBuffer = readstringNumber();

            try
            {
                return int.Parse(intBuffer.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture);
            }
            catch (Exception e) when (e is FormatException || e is OverflowException)
            {
                seqSource.unread(OtherEncodings.StringAsLatin1Bytes(intBuffer.ToString()));
                throw new IOException("Error: Expected an integer type at offset " + seqSource.getPosition(), e);
            }
        }

        /// <summary>
        /// Reads a long from the stream, after skipping leading whitespace and comments.
        /// </summary>
        /// <returns>The long that was read from the stream.</returns>
        /// <exception cref="IOException">
        /// If the token is not an integer or does not fit into a <see cref="long"/>;
        /// the token is pushed back onto the source first.
        /// </exception>
        protected long readLong()
        {
            SkipSpaces();

            StringBuilder longBuffer = readstringNumber();

            try
            {
                return long.Parse(longBuffer.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture);
            }
            catch (Exception e) when (e is FormatException || e is OverflowException)
            {
                seqSource.unread(OtherEncodings.StringAsLatin1Bytes(longBuffer.ToString()));

                throw new IOException(
                    $"Error: Expected a long type at offset {seqSource.getPosition()}, instead got \'{longBuffer}\'", e);
            }
        }

        /// <summary>
        /// Reads the token that <see cref="readInt"/> and <see cref="readLong"/> parse.
        /// Reading stops at space, LF, CR, '&lt;', '[', '(', a 0 byte or end of stream;
        /// the terminating value is pushed back unless it is end of stream.
        /// </summary>
        /// <returns>The token to parse as integer or long by the calling method.</returns>
        /// <exception cref="IOException">If the token grows longer than a long can be.</exception>
        protected StringBuilder readstringNumber()
        {
            int lastByte = 0;
            StringBuilder buffer = new StringBuilder();
            while ((lastByte = seqSource.read()) != ASCII_SPACE &&
                   lastByte != ASCII_LF &&
                   lastByte != ASCII_CR &&
                   lastByte != 60 && // see sourceforge bug 1714707
                   lastByte != '[' && // PDFBOX-1845
                   lastByte != '(' && // PDFBOX-2579
                   lastByte != 0 && // See sourceforge bug 853328
                   lastByte != -1)
            {
                buffer.Append((char)lastByte);
                if (buffer.Length > MAX_LENGTH_LONG)
                {
                    throw new IOException("Number '" + buffer + "' is getting too long, stop reading at offset " + seqSource.getPosition());
                }
            }

            if (lastByte != -1)
            {
                seqSource.unread(lastByte);
            }

            return buffer;
        }
    }
}