remove all old parsing logic

This commit is contained in:
Eliot Jones
2018-01-21 14:48:49 +00:00
parent da7d83d863
commit e24a306c31
12 changed files with 141 additions and 1021 deletions

View File

@@ -1,57 +0,0 @@
// ReSharper disable ObjectCreationAsStatement
namespace UglyToad.PdfPig.Tests.Parser.Parts
{
using System;
using IO;
using PdfPig.Cos;
using PdfPig.Parser.Parts;
using Xunit;
/// <summary>
/// Argument-validation tests for <see cref="CosDictionaryParser"/>: every test passes a single
/// null argument and expects an <see cref="ArgumentNullException"/>.
/// </summary>
public class CosDictionaryParserTests
{
    private readonly CosNameParser nameParser;
    private readonly CosDictionaryParser parser;

    public CosDictionaryParserTests()
    {
        nameParser = new CosNameParser();
        parser = new CosDictionaryParser(nameParser, new TestingLog());
    }

    [Fact]
    public void NameParserIsNull_Throws()
    {
        Action construct = () => new CosDictionaryParser(null, new TestingLog());

        Assert.Throws<ArgumentNullException>(construct);
    }

    [Fact]
    public void RandomAccessReadIsNull_Throws()
    {
        var baseParser = CreateBaseParser();

        Action parse = () => parser.Parse(null, baseParser, new CosObjectPool());

        Assert.Throws<ArgumentNullException>(parse);
    }

    [Fact]
    public void BaseParserIsNull_Throws()
    {
        Action parse = () => parser.Parse(new RandomAccessBuffer(), null, new CosObjectPool());

        Assert.Throws<ArgumentNullException>(parse);
    }

    [Fact]
    public void DocumentIsNull_Throws()
    {
        var baseParser = CreateBaseParser();

        Action parse = () => parser.Parse(new RandomAccessBuffer(), baseParser, null);

        Assert.Throws<ArgumentNullException>(parse);
    }

    // Builds a base parser wired to the dictionary parser under test.
    private CosBaseParser CreateBaseParser()
    {
        return new CosBaseParser(nameParser, new CosStringParser(), parser, new CosArrayParser());
    }
}
}

View File

@@ -1,23 +0,0 @@
namespace UglyToad.PdfPig.Tests
{
using IO;
using PdfPig.ContentStream;
using PdfPig.Cos;
using PdfPig.Parser.Parts;
/// <summary>
/// Stub <see cref="IDictionaryParser"/> for tests: ignores all arguments and returns a new, empty dictionary.
/// </summary>
internal class TestDictionaryParser : IDictionaryParser
{
    /// <summary>Returns a fresh <see cref="PdfDictionary"/> without reading from <paramref name="reader"/>.</summary>
    public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
        => new PdfDictionary();
}
/// <summary>
/// Stub <see cref="IBaseParser"/> for tests: ignores all arguments and returns <c>CosNull.Null</c>.
/// </summary>
internal class TestBaseParser : IBaseParser
{
    /// <summary>Returns <c>CosNull.Null</c> without reading from <paramref name="reader"/>.</summary>
    public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
        => CosNull.Null;
}
}

View File

@@ -1,5 +1,8 @@
namespace UglyToad.PdfPig.Parser.FileStructure
{
using IO;
using Tokenization.Scanner;
internal class CrossReferenceOffsetValidator
{
private readonly XrefOffsetValidator offsetValidator;
@@ -9,9 +12,9 @@
this.offsetValidator = offsetValidator;
}
public long Validate(long crossReferenceOffset, bool isLenientParsing)
public long Validate(long crossReferenceOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader, bool isLenientParsing)
{
long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceOffset, isLenientParsing);
long fixedOffset = offsetValidator.CheckXRefOffset(crossReferenceOffset, scanner, reader, isLenientParsing);
if (fixedOffset > -1)
{
crossReferenceOffset = fixedOffset;

View File

@@ -6,7 +6,6 @@
using Exceptions;
using IO;
using Logging;
using Parts;
using Parts.CrossReference;
using Tokenization.Scanner;
using Tokenization.Tokens;
@@ -14,19 +13,17 @@
internal class CrossReferenceParser
{
private readonly ILog log;
private readonly CosDictionaryParser dictionaryParser;
private readonly CosBaseParser baseParser;
private readonly XrefOffsetValidator offsetValidator;
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
private readonly CrossReferenceTableParser crossReferenceTableParser;
private readonly XrefCosOffsetChecker xrefCosChecker;
public CrossReferenceParser(ILog log, CosDictionaryParser dictionaryParser, CosBaseParser baseParser,
public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
CrossReferenceStreamParser crossReferenceStreamParser,
CrossReferenceTableParser crossReferenceTableParser)
{
this.log = log;
this.dictionaryParser = dictionaryParser;
this.baseParser = baseParser;
this.offsetValidator = offsetValidator;
this.crossReferenceStreamParser = crossReferenceStreamParser;
this.crossReferenceTableParser = crossReferenceTableParser;
@@ -36,8 +33,7 @@
public CrossReferenceTable Parse(IRandomAccessRead reader, bool isLenientParsing, long xrefLocation,
CosObjectPool pool, IPdfTokenScanner pdfScanner, ISeekableTokenScanner tokenScanner)
{
var xrefOffsetValidator = new XrefOffsetValidator(log, reader, dictionaryParser, baseParser, pool);
long fixedOffset = xrefOffsetValidator.CheckXRefOffset(xrefLocation, isLenientParsing);
long fixedOffset = offsetValidator.CheckXRefOffset(xrefLocation, tokenScanner, reader, isLenientParsing);
if (fixedOffset > -1)
{
xrefLocation = fixedOffset;
@@ -81,7 +77,7 @@
int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;
// check the xref stream reference
fixedOffset = xrefOffsetValidator.CheckXRefOffset(streamOffset, isLenientParsing);
fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, reader, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != streamOffset)
{
log.Warn($"/XRefStm offset {streamOffset} is incorrect, corrected to {fixedOffset}");
@@ -148,7 +144,7 @@
if (previousCrossReferenceLocation > 0)
{
// check the xref table reference
fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
fixedOffset = offsetValidator.CheckXRefOffset(previousCrossReferenceLocation, tokenScanner, reader, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
{
previousCrossReferenceLocation = fixedOffset;

View File

@@ -2,37 +2,27 @@
{
using System;
using System.Collections.Generic;
using ContentStream;
using Cos;
using IO;
using Logging;
using Parts;
using Tokenization.Scanner;
using Tokenization.Tokens;
internal class XrefOffsetValidator
{
private static readonly long MinimumSearchOffset = 6;
private readonly ILog log;
private readonly IRandomAccessRead source;
private readonly CosDictionaryParser dictionaryParser;
private readonly CosBaseParser baseParser;
private readonly CosObjectPool pool;
private List<long> bfSearchXRefTablesOffsets = null;
private List<long> bfSearchXRefStreamsOffsets = null;
private List<long> bfSearchXRefTablesOffsets;
private List<long> bfSearchXRefStreamsOffsets;
public XrefOffsetValidator(ILog log, IRandomAccessRead source, CosDictionaryParser dictionaryParser,
CosBaseParser baseParser,
CosObjectPool pool)
public XrefOffsetValidator(ILog log)
{
this.log = log;
this.source = source;
this.dictionaryParser = dictionaryParser;
this.baseParser = baseParser;
this.pool = pool;
}
public long CheckXRefOffset(long startXRefOffset, bool isLenientParsing)
public long CheckXRefOffset(long startXRefOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader, bool isLenientParsing)
{
// repair mode isn't available in non-lenient mode
if (!isLenientParsing)
@@ -40,127 +30,133 @@
return startXRefOffset;
}
source.Seek(startXRefOffset);
reader.Seek(startXRefOffset);
ReadHelper.SkipSpaces(source);
ReadHelper.SkipSpaces(reader);
if (source.Peek() == 'x' && ReadHelper.IsString(source, "xref"))
if (reader.Peek() == 'x' && ReadHelper.IsString(reader, "xref"))
{
return startXRefOffset;
}
if (startXRefOffset > 0)
{
if (CheckXRefStreamOffset(source, startXRefOffset, true, pool))
if (CheckXRefStreamOffset(startXRefOffset, scanner, true))
{
return startXRefOffset;
}
return CalculateXRefFixedOffset(startXRefOffset);
return CalculateXRefFixedOffset(startXRefOffset, scanner, reader);
}
// can't find a valid offset
return -1;
}
private long CalculateXRefFixedOffset(long objectOffset)
private long CalculateXRefFixedOffset(long objectOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader)
{
if (objectOffset < 0)
{
// LOG.error("Invalid object offset " + objectOffset + " when searching for a xref table/stream");
log.Error($"Invalid object offset {objectOffset} when searching for a xref table/stream");
return 0;
}
// start a brute force search for all xref tables and try to find the offset we are looking for
long newOffset = BfSearchForXRef(objectOffset);
long newOffset = BfSearchForXRef(objectOffset, scanner, reader);
if (newOffset > -1)
{
// LOG.debug("Fixed reference for xref table/stream " + objectOffset + " -> " + newOffset);
log.Debug($"Fixed reference for xref table/stream {objectOffset} -> {newOffset}");
return newOffset;
}
// LOG.error("Can't find the object xref table/stream at offset " + objectOffset);
log.Error($"Can\'t find the object xref table/stream at offset {objectOffset}");
return 0;
}
private void BfSearchForXRefStreams()
private void BfSearchForXRefStreams(IRandomAccessRead reader)
{
if (bfSearchXRefStreamsOffsets == null)
if (bfSearchXRefStreamsOffsets != null)
{
// a pdf may contain more than one /XRef entry
bfSearchXRefStreamsOffsets = new List<long>();
long originOffset = source.GetPosition();
source.Seek(MinimumSearchOffset);
// search for XRef streams
var objString = " obj";
while (!source.IsEof())
return;
}
// a pdf may contain more than one /XRef entry
bfSearchXRefStreamsOffsets = new List<long>();
long originOffset = reader.GetPosition();
reader.Seek(MinimumSearchOffset);
// search for XRef streams
var objString = " obj";
while (!reader.IsEof())
{
if (ReadHelper.IsString(reader, "xref"))
{
if (ReadHelper.IsString(source, "xref"))
// search backwards for the beginning of the stream
long newOffset = -1;
long xrefOffset = reader.GetPosition();
bool objFound = false;
for (int i = 1; i < 40 && !objFound; i++)
{
// search backwards for the beginning of the stream
long newOffset = -1;
long xrefOffset = source.GetPosition();
bool objFound = false;
for (int i = 1; i < 40 && !objFound; i++)
long currentOffset = xrefOffset - (i * 10);
if (currentOffset > 0)
{
long currentOffset = xrefOffset - (i * 10);
if (currentOffset > 0)
reader.Seek(currentOffset);
for (int j = 0; j < 10; j++)
{
source.Seek(currentOffset);
for (int j = 0; j < 10; j++)
if (ReadHelper.IsString(reader, objString))
{
if (ReadHelper.IsString(source, objString))
long tempOffset = currentOffset - 1;
reader.Seek(tempOffset);
int genId = reader.Peek();
// is the next char a digit?
if (ReadHelper.IsDigit(genId))
{
long tempOffset = currentOffset - 1;
source.Seek(tempOffset);
int genId = source.Peek();
// is the next char a digit?
if (ReadHelper.IsDigit(genId))
tempOffset--;
reader.Seek(tempOffset);
if (ReadHelper.IsSpace(reader))
{
tempOffset--;
source.Seek(tempOffset);
if (ReadHelper.IsSpace(source))
int length = 0;
reader.Seek(--tempOffset);
while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(reader))
{
int length = 0;
source.Seek(--tempOffset);
while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(source))
{
source.Seek(--tempOffset);
length++;
}
if (length > 0)
{
source.Read();
newOffset = source.GetPosition();
}
reader.Seek(--tempOffset);
length++;
}
if (length > 0)
{
reader.Read();
newOffset = reader.GetPosition();
}
}
objFound = true;
break;
}
else
{
currentOffset++;
source.Read();
}
objFound = true;
break;
}
else
{
currentOffset++;
reader.Read();
}
}
}
if (newOffset > -1)
{
bfSearchXRefStreamsOffsets.Add(newOffset);
}
source.Seek(xrefOffset + 5);
}
source.Read();
if (newOffset > -1)
{
bfSearchXRefStreamsOffsets.Add(newOffset);
}
reader.Seek(xrefOffset + 5);
}
source.Seek(originOffset);
reader.Read();
}
reader.Seek(originOffset);
}
private long BfSearchForXRef(long xrefOffset)
private long BfSearchForXRef(long xrefOffset, ISeekableTokenScanner scanner, IRandomAccessRead reader)
{
long newOffset = -1;
long newOffsetTable = -1;
long newOffsetStream = -1;
BfSearchForXRefTables();
BfSearchForXRefStreams();
BfSearchForXRefTables(reader);
BfSearchForXRefStreams(reader);
if (bfSearchXRefTablesOffsets != null)
{
// TODO to be optimized, this won't work in every case
@@ -200,31 +196,31 @@
return newOffset;
}
private void BfSearchForXRefTables()
private void BfSearchForXRefTables(IRandomAccessRead reader)
{
if (bfSearchXRefTablesOffsets == null)
{
// a pdf may contain more than one xref entry
bfSearchXRefTablesOffsets = new List<long>();
long originOffset = source.GetPosition();
source.Seek(MinimumSearchOffset);
long originOffset = reader.GetPosition();
reader.Seek(MinimumSearchOffset);
// search for xref tables
while (!source.IsEof())
while (!reader.IsEof())
{
if (ReadHelper.IsString(source, "xref"))
if (ReadHelper.IsString(reader, "xref"))
{
long newOffset = source.GetPosition();
source.Seek(newOffset - 1);
long newOffset = reader.GetPosition();
reader.Seek(newOffset - 1);
// ensure that we don't read "startxref" instead of "xref"
if (ReadHelper.IsWhitespace(source))
if (ReadHelper.IsWhitespace(reader))
{
bfSearchXRefTablesOffsets.Add(newOffset);
}
source.Seek(newOffset + 4);
reader.Seek(newOffset + 4);
}
source.Read();
reader.Read();
}
source.Seek(originOffset);
reader.Seek(originOffset);
}
}
@@ -252,7 +248,7 @@
return newValue;
}
private bool CheckXRefStreamOffset(IRandomAccessRead source, long startXRefOffset, bool isLenient, CosObjectPool pool)
private bool CheckXRefStreamOffset(long startXRefOffset, ISeekableTokenScanner scanner, bool isLenient)
{
// repair mode isn't available in non-lenient mode
if (!isLenient || startXRefOffset == 0)
@@ -260,37 +256,41 @@
return true;
}
// seek to offset-1
source.Seek(startXRefOffset - 1);
int nextValue = source.Read();
// the first character has to be a whitespace, and then a digit
if (ReadHelper.IsWhitespace(nextValue))
scanner.Seek(startXRefOffset - 1);
if (scanner.TryReadToken(out NumericToken objectNumber))
{
ReadHelper.SkipSpaces(source);
if (ReadHelper.IsDigit(source))
try
{
try
if (!scanner.TryReadToken(out NumericToken generation))
{
// it's a XRef stream
ObjectHelper.ReadObjectNumber(source);
ObjectHelper.ReadGenerationNumber(source);
ReadHelper.ReadExpectedString(source, "obj", true);
// check the dictionary to avoid false positives
PdfDictionary dict = dictionaryParser.Parse(source, baseParser, pool);
source.Seek(startXRefOffset);
if (dict.IsType(CosName.XREF))
{
return true;
}
log.Debug($"When checking offset at {startXRefOffset} did not find the generation number. Got: {objectNumber} {generation}.");
}
catch (Exception ex)
scanner.MoveNext();
var obj = scanner.CurrentToken;
if (!ReferenceEquals(obj, OperatorToken.StartObject))
{
log.Error("Couldn't read the xref stream object.", ex);
// there wasn't an object of a xref stream
source.Seek(startXRefOffset);
scanner.Seek(startXRefOffset);
return false;
}
// check the dictionary to avoid false positives
if (!scanner.TryReadToken(out DictionaryToken dictionary))
{
scanner.Seek(startXRefOffset);
}
if (dictionary.TryGet(NameToken.Type, out var type) && NameToken.Xref.Equals(type))
{
return true;
}
}
catch (Exception ex)
{
log.Error("Couldn't read the xref stream object.", ex);
}
}
return false;

View File

@@ -1,71 +0,0 @@
namespace UglyToad.PdfPig.Parser.Parts
{
using ContentStream;
using Cos;
using IO;
using Util;
/// <summary>
/// Parses a PDF array ("[ ... ]") from a reader into a <see cref="COSArray"/>.
/// </summary>
internal class CosArrayParser
{
    /// <summary>
    /// Reads an array starting at the reader's current position, which must be the opening '['.
    /// </summary>
    /// <param name="reader">The input positioned on '['.</param>
    /// <param name="baseParser">Parser used for each element of the array.</param>
    /// <param name="pool">Pool used to resolve indirect references ("objectNumber generation R").</param>
    /// <returns>
    /// The parsed array. Parsing stops early, without consuming a ']', when "endobj" or "endstream"
    /// is met where an element was expected.
    /// </returns>
    public COSArray Parse(IRandomAccessRead reader, CosBaseParser baseParser, CosObjectPool pool)
    {
        ReadHelper.ReadExpectedChar(reader, '[');
        var po = new COSArray();
        CosBase pbo;
        ReadHelper.SkipSpaces(reader);
        int i;
        while (((i = reader.Peek()) > 0) && ((char)i != ']'))
        {
            pbo = baseParser.Parse(reader, pool);
            if (pbo is CosObject)
            {
                // A CosObject marks an 'R' token: the two preceding elements should be the object
                // number and generation number. We have to check if the expected values are there
                // or not (PDFBOX-385). The size checks stop an 'R' with fewer than two preceding
                // elements from indexing the array at -1.
                if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
                {
                    var genNumber = (CosInt)po.remove(po.size() - 1);
                    if (po.size() > 0 && po.get(po.size() - 1) is CosInt)
                    {
                        var number = (CosInt)po.remove(po.size() - 1);
                        IndirectReference key = new IndirectReference(number.AsLong(), genNumber.AsInt());
                        pbo = pool.Get(key);
                    }
                    else
                    {
                        // the object reference is somehow wrong
                        pbo = null;
                    }
                }
                else
                {
                    pbo = null;
                }
            }

            if (pbo != null)
            {
                po.add(pbo);
            }
            else
            {
                // It could be a bad object in the array which is just skipped.
                // This could also be an "endobj" or "endstream" which means we can assume that
                // the array has ended. The word is pushed back either way so the caller sees it.
                string isThisTheEnd = ReadHelper.ReadString(reader);
                reader.Unread(OtherEncodings.StringAsLatin1Bytes(isThisTheEnd));
                if (string.Equals(isThisTheEnd, "endobj") || string.Equals(isThisTheEnd, "endstream"))
                {
                    return po;
                }
            }

            ReadHelper.SkipSpaces(reader);
        }

        // read ']'
        reader.Read();
        ReadHelper.SkipSpaces(reader);
        return po;
    }
}
}

View File

@@ -1,166 +0,0 @@
namespace UglyToad.PdfPig.Parser.Parts
{
using System.IO;
using System.Text;
using ContentStream;
using Cos;
using IO;
using Util;
/// <summary>
/// Parses a single object from a reader.
/// </summary>
internal interface IBaseParser
{
    /// <summary>
    /// Parses the next object available from <paramref name="reader"/>.
    /// </summary>
    /// <param name="reader">The input to read from.</param>
    /// <param name="pool">The object pool handed to the parser; presumably used to resolve indirect references (confirm against implementations).</param>
    /// <returns>The parsed object.</returns>
    CosBase Parse(IRandomAccessRead reader, CosObjectPool pool);
}
internal class CosBaseParser : IBaseParser
{
private readonly CosNameParser nameParser;
private readonly CosStringParser stringParser;
private readonly CosDictionaryParser dictionaryParser;
private readonly CosArrayParser arrayParser;
/// <summary>
/// Creates a base parser which delegates to the given specialised parsers.
/// </summary>
/// <param name="nameParser">Parser for names (input starting with '/').</param>
/// <param name="stringParser">Parser for literal and hex strings.</param>
/// <param name="dictionaryParser">Parser for dictionaries.</param>
/// <param name="arrayParser">Parser for arrays (input starting with '[').</param>
public CosBaseParser(
    CosNameParser nameParser,
    CosStringParser stringParser,
    CosDictionaryParser dictionaryParser,
    CosArrayParser arrayParser)
{
    this.arrayParser = arrayParser;
    this.dictionaryParser = dictionaryParser;
    this.stringParser = stringParser;
    this.nameParser = nameParser;
}
/// <summary>
/// Skips leading spaces, then parses the next object, picking the parser from the first character.
/// </summary>
/// <param name="reader">The input to read from.</param>
/// <param name="pool">Object pool passed on to the dictionary and array parsers.</param>
/// <returns>
/// The parsed object, or <see langword="null"/> when the reader is at end of input or when an
/// unrecognised bare word was read.
/// </returns>
/// <exception cref="IOException">
/// A word starting with 't' / 'f' is not "true" / "false", or no progress can be made on unknown input.
/// </exception>
public CosBase Parse(IRandomAccessRead reader, CosObjectPool pool)
{
    ReadHelper.SkipSpaces(reader);

    int peeked = reader.Peek();
    if (peeked == -1)
    {
        return null;
    }

    char first = (char)peeked;
    switch (first)
    {
        case '<':
        {
            // Take the first bracket off to look at what follows it, then put it back so the
            // chosen parser sees the input from its start.
            int bracket = reader.Read();
            char second = (char)reader.Peek();
            reader.Unread(bracket);

            if (second != '<')
            {
                return stringParser.Parse(reader);
            }

            var dictionary = dictionaryParser.Parse(reader, this, pool);
            ReadHelper.SkipSpaces(reader);
            return dictionary;
        }
        case '[':
            return arrayParser.Parse(reader, this, pool);
        case '(':
            return stringParser.Parse(reader);
        case '/':
            return nameParser.Parse(reader);
        case 'n':
            ReadHelper.ReadExpectedString(reader, "null");
            return CosNull.Null;
        case 't':
        {
            string word = OtherEncodings.BytesAsLatin1String(reader.ReadFully(4));
            if (!word.Equals("true"))
            {
                throw new IOException("expected true actual='" + word + "' " + reader +
                                      "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.True;
        }
        case 'f':
        {
            string word = OtherEncodings.BytesAsLatin1String(reader.ReadFully(5));
            if (!word.Equals("false"))
            {
                throw new IOException("expected false actual='" + word + "' " + reader +
                                      "' at offset " + reader.GetPosition());
            }

            return PdfBoolean.False;
        }
        case 'R':
            // Consume the 'R' and hand back an empty object as a marker for the caller.
            reader.Read();
            return new CosObject(null);
        default:
        {
            bool startsNumber = char.IsDigit(first) || first == '-' || first == '+' || first == '.';
            return startsNumber ? ReadNumber(reader) : ConsumeUnknownWord(reader, first);
        }
    }
}

// Collects the characters of a numeric literal and converts them with CosNumberFactory.
private static CosBase ReadNumber(IRandomAccessRead reader)
{
    var text = new StringBuilder();

    int raw = reader.Read();
    char current = (char)raw;
    while (char.IsDigit(current)
           || current == '-'
           || current == '+'
           || current == '.'
           || current == 'E'
           || current == 'e')
    {
        text.Append(current);
        raw = reader.Read();
        current = (char)raw;
    }

    // Put back the character that ended the number, unless it was end of input.
    if (raw != -1)
    {
        reader.Unread(raw);
    }

    return CosNumberFactory.get(text.ToString()) as CosBase;
}

// Handles input which starts no known object. This is not supposed to happen, but it is allowed
// for so that writers which don't follow the spec are tolerated. Always returns null.
private static CosBase ConsumeUnknownWord(IRandomAccessRead reader, char first)
{
    string word = ReadHelper.ReadString(reader);
    if (word == string.Empty)
    {
        // Nothing was consumed: throwing here avoids an infinite loop in the caller.
        int peek = reader.Peek();
        throw new IOException("Unknown dir object c='" + first +
                              "' cInt=" + (int)first + " peek='" + (char)peek
                              + "' peekInt=" + peek + " at offset " + reader.GetPosition());
    }

    // An endstream/endobj is put back so the caller will see it.
    if (string.Equals("endobj", word) || string.Equals("endstream", word))
    {
        reader.Unread(OtherEncodings.StringAsLatin1Bytes(word));
    }

    return null;
}
}
}

View File

@@ -1,205 +0,0 @@
namespace UglyToad.PdfPig.Parser.Parts
{
using System;
using ContentStream;
using Cos;
using IO;
using Logging;
using Util;
using Util.JetBrains.Annotations;
/// <summary>
/// Parses a dictionary from a reader.
/// </summary>
internal interface IDictionaryParser
{
    /// <summary>
    /// Parses a dictionary from <paramref name="reader"/>.
    /// </summary>
    /// <param name="reader">The input to read from.</param>
    /// <param name="baseParser">The parser handed in for nested objects; presumably used for the dictionary's values (confirm against implementations).</param>
    /// <param name="pool">The object pool handed to the parser.</param>
    /// <returns>The parsed dictionary.</returns>
    PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool);
}
internal class CosDictionaryParser : IDictionaryParser
{
private readonly ILog log;
private readonly CosNameParser nameParser;
protected static readonly int E = 'e';
protected static readonly int N = 'n';
protected static readonly int D = 'd';
protected static readonly int S = 's';
protected static readonly int T = 't';
protected static readonly int R = 'r';
protected static readonly int A = 'a';
protected static readonly int M = 'm';
protected static readonly int O = 'o';
protected static readonly int B = 'b';
protected static readonly int J = 'j';
/// <summary>
/// Creates a dictionary parser.
/// </summary>
/// <param name="nameParser">Parser used for the dictionary keys; must not be null.</param>
/// <param name="log">Log for warnings about malformed entries; not validated here.</param>
/// <exception cref="ArgumentNullException"><paramref name="nameParser"/> is null.</exception>
public CosDictionaryParser(CosNameParser nameParser, ILog log)
{
    this.log = log;
    // Name the offending parameter so the exception message says what was missing.
    this.nameParser = nameParser ?? throw new ArgumentNullException(nameof(nameParser));
}
/// <summary>
/// Parses a dictionary ("&lt;&lt; /Key value ... &gt;&gt;") from the reader's current position.
/// </summary>
/// <param name="reader">The input, positioned on the opening "&lt;&lt;".</param>
/// <param name="baseParser">Parser used for the entry values.</param>
/// <param name="pool">Pool used to resolve indirect references in values.</param>
/// <returns>
/// The parsed dictionary, or a new empty dictionary when "endobj", "endstream" or end of input is
/// reached while skipping unexpected content.
/// </returns>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public PdfDictionary Parse(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader));
    }

    if (baseParser == null)
    {
        throw new ArgumentNullException(nameof(baseParser));
    }

    if (pool == null)
    {
        throw new ArgumentNullException(nameof(pool));
    }

    ReadHelper.ReadExpectedChar(reader, '<');
    ReadHelper.ReadExpectedChar(reader, '<');
    ReadHelper.SkipSpaces(reader);

    var result = new PdfDictionary();

    for (;;)
    {
        ReadHelper.SkipSpaces(reader);
        var next = (char)reader.Peek();

        if (next == '>')
        {
            // Start of the closing ">>".
            break;
        }

        if (next == '/')
        {
            var (key, value) = ParseCosDictionaryNameValuePair(reader, baseParser, pool);
            if (key != null && value != null)
            {
                result.Set(key, value);
            }

            continue;
        }

        // Anything else is unexpected: skip ahead to the next '/' or '>' and give up on the
        // whole dictionary if the enclosing object or the input ends first.
        if (ReadUntilEnd(reader))
        {
            return new PdfDictionary();
        }
    }

    ReadHelper.ReadExpectedString(reader, ">>");
    return result;
}
/// <summary>
/// Parses one "/Key value" entry of a dictionary.
/// </summary>
/// <returns>
/// The key and value, or <c>(null, null)</c> when no value could be parsed. A returned value has
/// its <c>Direct</c> flag set.
/// </returns>
[ItemCanBeNull]
private (CosName key, CosBase value) ParseCosDictionaryNameValuePair(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    var name = nameParser.Parse(reader);
    var parsed = ParseValue(reader, baseParser, pool);

    ReadHelper.SkipSpaces(reader);

    if ((char)reader.Peek() == 'd')
    {
        // A trailing 'def' means we are parsing a cmap stream and the word is ignored;
        // any other word is pushed back untouched.
        var word = ReadHelper.ReadString(reader);
        if (word.Equals("def"))
        {
            ReadHelper.SkipSpaces(reader);
        }
        else
        {
            reader.Unread(OtherEncodings.StringAsLatin1Bytes(word));
        }
    }

    if (parsed != null)
    {
        // label this item as direct, to avoid signature problems.
        parsed.Direct = true;
        return (name, parsed);
    }

    log?.Warn("Bad Dictionary Declaration " + ReadHelper.ReadString(reader));
    return (null, null);
}
/// <summary>
/// Parses a dictionary value. A number followed by another number is treated as an indirect
/// reference ("objectNumber generation R") and resolved through <paramref name="pool"/>.
/// </summary>
/// <param name="reader">The input positioned at the start of the value.</param>
/// <param name="baseParser">Parser used to read the value and, for references, the generation number.</param>
/// <param name="pool">Pool from which a referenced object is fetched.</param>
/// <returns>The parsed direct value, or the pooled object for an indirect reference.</returns>
/// <exception cref="InvalidOperationException">
/// Either number of an indirect reference is not an integer.
/// </exception>
private static CosBase ParseValue(IRandomAccessRead reader, IBaseParser baseParser, CosObjectPool pool)
{
    var numOffset = reader.GetPosition();
    var value = baseParser.Parse(reader, pool);

    ReadHelper.SkipSpaces(reader);

    // proceed if the given object is a number and the following is a number as well
    if (!(value is ICosNumber) || !ReadHelper.IsDigit(reader))
    {
        return value;
    }

    // read the remaining information of the object number
    var genOffset = reader.GetPosition();
    var generationNumber = baseParser.Parse(reader, pool);

    ReadHelper.SkipSpaces(reader);
    ReadHelper.ReadExpectedChar(reader, 'R');

    if (!(value is CosInt))
    {
        throw new InvalidOperationException("expected number, actual=" + value + " at offset " + numOffset);
    }

    if (!(generationNumber is CosInt))
    {
        // Report the generation number that failed the check, not the already validated object number.
        throw new InvalidOperationException("expected number, actual=" + generationNumber + " at offset " + genOffset);
    }

    var key = new IndirectReference(((CosInt)value).AsLong(), ((CosInt)generationNumber).AsInt());

    // dereference the object
    return pool.Get(key);
}
/// <summary>
/// Skips input until a '/' or '>' is found, or until "endstream", "endobj" or end of input is reached.
/// </summary>
/// <param name="reader">The input to advance.</param>
/// <returns>
/// <c>true</c> when "endstream" or "endobj" was read or the input ended; <c>false</c> when the
/// scan stopped at a '/' or '>', which is pushed back onto the reader.
/// </returns>
private static bool ReadUntilEnd(IRandomAccessRead reader)
{
    var c = reader.Read();
    while (c != -1 && c != '/' && c != '>')
    {
        // in addition to stopping when we find / or >, we also want
        // to stop when we find endstream or endobj.
        // NOTE(review): when a partial match fails (for example 'e' followed by '/'), the
        // character that broke the match is replaced by the read at the bottom of the loop
        // without being tested against '/' or '>' - confirm that skipping it is intended.
        if (c == E)
        {
            c = reader.Read();
            if (c == N)
            {
                c = reader.Read();
                if (c == D)
                {
                    c = reader.Read();
                    // The && chains stop reading at the first character that does not match.
                    var isStream = c == S && reader.Read() == T && reader.Read() == R
                                   && reader.Read() == E && reader.Read() == A && reader.Read() == M;
                    var isObj = !isStream && c == O && reader.Read() == B && reader.Read() == J;
                    if (isStream || isObj)
                    {
                        // we're done reading this object!
                        return true;
                    }
                }
            }
        }
        c = reader.Read();
    }
    if (c == -1)
    {
        // End of input counts as the end of the object.
        return true;
    }
    // Put the '/' or '>' back so the caller can act on it.
    reader.Unread(c);
    return false;
}
}
}

View File

@@ -1,88 +0,0 @@
namespace UglyToad.PdfPig.Parser.Parts
{
using System;
using System.IO;
using System.Text;
using Cos;
using IO;
using Util.JetBrains.Annotations;
/// <summary>
/// Parses a PDF name ("/Name") from a reader into a <see cref="CosName"/>.
/// </summary>
internal class CosNameParser
{
    /// <summary>
    /// Reads a name starting at the reader's current position, which must be the leading '/'.
    /// A '#' followed by two hex digits is decoded to the byte it encodes. The collected bytes
    /// are decoded as UTF-8 when valid and as windows-1252 otherwise.
    /// </summary>
    /// <param name="reader">The input positioned on '/'.</param>
    /// <returns>The name created from the decoded text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
    [NotNull]
    public CosName Parse([NotNull]IRandomAccessRead reader)
    {
        if (reader == null)
        {
            throw new ArgumentNullException(nameof(reader));
        }

        ReadHelper.ReadExpectedChar(reader, '/');

        using (var memoryStream = new MemoryStream())
        using (var writer = new BinaryWriter(memoryStream))
        {
            int c = reader.Read();
            while (c != -1)
            {
                byte ch = (byte)c;
                if (ch == '#')
                {
                    int ch1 = reader.Read();
                    int ch2 = reader.Read();

                    // Prior to PDF v1.2, the # was not a special character. Also,
                    // it has been observed that various PDF tools do not follow the
                    // spec with respect to the # escape, even though they report
                    // PDF versions of 1.2 or later. The solution here is that we
                    // interpret the # as an escape only when it is followed by two
                    // valid hex digits.
                    if (ReadHelper.IsHexDigit((char)ch1) && ReadHelper.IsHexDigit((char)ch2))
                    {
                        string hex = "" + (char)ch1 + (char)ch2;
                        try
                        {
                            var byteToWrite = (byte)Convert.ToInt32(hex, 16);
                            writer.Write(byteToWrite);
                        }
                        catch (FormatException e)
                        {
                            throw new IOException("Error: expected hex digit, actual='" + hex + "'", e);
                        }

                        c = reader.Read();
                    }
                    else
                    {
                        // check for premature EOF
                        if (ch2 == -1 || ch1 == -1)
                        {
                            c = -1;
                            break;
                        }

                        // Not an escape: keep the '#' literally, push the second character back
                        // and carry on from the first one.
                        reader.Unread(ch2);
                        c = ch1;
                        writer.Write(ch);
                    }
                }
                else if (ReadHelper.IsEndOfName(ch))
                {
                    break;
                }
                else
                {
                    writer.Write(ch);
                    c = reader.Read();
                }
            }

            // Put back the character which ended the name, unless the input ran out.
            if (c != -1)
            {
                reader.Unread(c);
            }

            // Copy the stream once and decode that copy, instead of copying it again per decode.
            byte[] bytes = memoryStream.ToArray();
            var str = ReadHelper.IsValidUtf8(bytes)
                ? Encoding.UTF8.GetString(bytes)
                : Encoding.GetEncoding("windows-1252").GetString(bytes);
            return CosName.Create(str);
        }
    }
}
}

View File

@@ -1,260 +0,0 @@
namespace UglyToad.PdfPig.Parser.Parts
{
using System;
using System.IO;
using System.Text;
using Cos;
using IO;
internal class CosStringParser
{
/// <summary>
/// Parses a PDF string. A leading '&lt;' is handled as a hex string; a leading '(' starts a
/// literal string in which nested parentheses are balanced and backslash escapes are decoded.
/// </summary>
/// <param name="seqSource">The input positioned on the opening '(' or '&lt;'.</param>
/// <returns>The parsed string.</returns>
/// <exception cref="IOException">The input does not start with '(' or '&lt;'.</exception>
public CosString Parse(IRandomAccessRead seqSource)
{
    char nextChar = (char)seqSource.Read();
    if (nextChar == '<')
    {
        return ParseHexString(seqSource);
    }

    if (nextChar != '(')
    {
        throw new IOException("parseCOSstring string should start with '(' or '<' and not '" +
                              nextChar + "' " + seqSource);
    }

    // NOTE(review): StreamWriter encodes characters with its default text encoding, so a
    // character above 0x7F may be written as more than one byte - confirm this is what
    // CosString expects.
    using (var memoryStream = new MemoryStream())
    using (var writer = new StreamWriter(memoryStream))
    {
        // This is the number of braces read
        int braces = 1;
        int c = seqSource.Read();
        while (braces > 0 && c != -1)
        {
            char ch = (char) c;
            int nextc = -2; // not yet read

            if (ch == ')')
            {
                braces--;
                braces = CheckForEndOfString(seqSource, braces);
                if (braces != 0)
                {
                    writer.Write(ch);
                }
            }
            else if (ch == '(')
            {
                braces++;
                writer.Write(ch);
            }
            else if (ch == '\\')
            {
                //patched by ram
                char next = (char) seqSource.Read();
                switch (next)
                {
                    case 'n':
                        writer.Write('\n');
                        break;
                    case 'r':
                        writer.Write('\r');
                        break;
                    case 't':
                        writer.Write('\t');
                        break;
                    case 'b':
                        writer.Write('\b');
                        break;
                    case 'f':
                        writer.Write('\f');
                        break;
                    case ')':
                        // PDFBox 276 /Title (c:\)
                        braces = CheckForEndOfString(seqSource, braces);
                        if (braces != 0)
                        {
                            writer.Write(next);
                        }
                        else
                        {
                            writer.Write('\\');
                        }
                        break;
                    case '(':
                    case '\\':
                        writer.Write(next);
                        break;
                    case '0':
                    case '1':
                    case '2':
                    case '3':
                    case '4':
                    case '5':
                    case '6':
                    case '7':
                    {
                        // Up to three octal digits form one character code.
                        var octal = new StringBuilder();
                        octal.Append(next);
                        c = seqSource.Read();
                        char digit = (char) c;
                        if (digit >= '0' && digit <= '7')
                        {
                            octal.Append(digit);
                            c = seqSource.Read();
                            digit = (char) c;
                            if (digit >= '0' && digit <= '7')
                            {
                                octal.Append(digit);
                            }
                            else
                            {
                                nextc = c;
                            }
                        }
                        else
                        {
                            nextc = c;
                        }

                        int character;
                        try
                        {
                            character = Convert.ToInt32(octal.ToString(), 8);
                        }
                        catch (FormatException e)
                        {
                            throw new IOException("Error: Expected octal character, actual='" + octal + "'", e);
                        }

                        // Cast to char: TextWriter.Write(int) would emit the decimal digits of
                        // the code (for example "65") instead of the character it denotes.
                        writer.Write((char)character);
                        break;
                    }
                    default:
                        // Test the escaped character itself; 'c' still holds the backslash here,
                        // so comparing 'c' could never detect a line break.
                        if (next == ReadHelper.AsciiCarriageReturn || next == ReadHelper.AsciiLineFeed)
                        {
                            // this is a break in the line so ignore it and the newline and continue
                            c = seqSource.Read();
                            while (ReadHelper.IsEndOfLine(c) && c != -1)
                            {
                                c = seqSource.Read();
                            }

                            nextc = c;
                            break;
                        }
                        // dropping the backslash
                        // see 7.3.4.2 Literal strings for further information
                        writer.Write(next);
                        break;
                }
            }
            else
            {
                writer.Write(ch);
            }

            // Use the look-ahead character when a branch above already read one.
            if (nextc != -2)
            {
                c = nextc;
            }
            else
            {
                c = seqSource.Read();
            }
        }

        // Put back the character read after the closing parenthesis.
        if (c != -1)
        {
            seqSource.Unread(c);
        }

        writer.Flush();
        return new CosString(memoryStream.ToArray());
    }
}
/// <summary>
/// Looks at the next three bytes to decide whether a ')' ends the string even though the
/// parentheses are not balanced. The bytes are always pushed back onto the reader.
/// </summary>
/// <param name="reader">The input positioned directly after the ')'.</param>
/// <param name="bracesParameter">The current count of open parentheses.</param>
/// <returns>0 when the following bytes indicate the end of the string; otherwise <paramref name="bracesParameter"/>.</returns>
private static int CheckForEndOfString(IRandomAccessRead reader, int bracesParameter)
{
    int braces = bracesParameter;
    byte[] nextThreeBytes = new byte[3];
    int amountRead = reader.Read(nextThreeBytes);

    // Check the next 3 bytes if available
    // The following cases are valid indicators for the end of the string
    // 1. Next line contains another COSObject: CR + LF + '/'
    // 2. CosDictionary ends in the next line: CR + LF + '>'
    // 3. Next line contains another COSObject: CR + '/'
    // 4. CosDictionary ends in the next line: CR + '>'
    if (amountRead == 3 && nextThreeBytes[0] == ReadHelper.AsciiCarriageReturn)
    {
        // Cases 1 and 2. The inner parentheses matter: && binds tighter than ||, so without
        // them a '>' in the third byte would match whatever the second byte is.
        bool afterCarriageReturnLineFeed = nextThreeBytes[1] == ReadHelper.AsciiLineFeed
                                           && (nextThreeBytes[2] == '/' || nextThreeBytes[2] == '>');

        // Cases 3 and 4.
        bool afterCarriageReturn = nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>';

        if (afterCarriageReturnLineFeed || afterCarriageReturn)
        {
            braces = 0;
        }
    }

    if (amountRead > 0)
    {
        reader.Unread(nextThreeBytes, 0, amountRead);
    }

    return braces;
}
/// <summary>
/// Parses a PDF hex string, failing fast: reading of digits stops at the first character that
/// is not allowed, so malformed input is detected and the rest up to the closing bracket can be skipped.
/// The opening '&lt;' must already have been read.
/// </summary>
/// <param name="reader">The input positioned directly after the opening '&lt;'.</param>
/// <returns>The string built from the hex digits collected.</returns>
/// <exception cref="IOException">The input ends before a closing '&gt;' is found.</exception>
private static CosString ParseHexString(IRandomAccessRead reader)
{
    var hexDigits = new StringBuilder();

    for (;;)
    {
        int current = reader.Read();

        if (ReadHelper.IsHexDigit((char)current))
        {
            hexDigits.Append((char)current);
            continue;
        }

        if (current == '>')
        {
            break;
        }

        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        bool isIgnorable = current == ' ' || current == '\n' || current == '\t'
                           || current == '\r' || current == '\b' || current == '\f';
        if (isIgnorable)
        {
            continue;
        }

        // An invalid character was found: discard the last hex digit when it has no partner.
        if (hexDigits.Length % 2 != 0)
        {
            hexDigits.Remove(hexDigits.Length - 1, 1);
        }

        // Skip ahead to the closing bracket.
        do
        {
            current = reader.Read();
        }
        while (current != '>' && current >= 0);

        // End of input may be hit while looking for the bracket (malformed PDFs only);
        // throwing makes sure there is no endless loop.
        if (current < 0)
        {
            throw new IOException("Missing closing bracket for hex string. Reached EOS.");
        }

        break;
    }

    return CosString.ParseHex(hexDigits.ToString());
}
}
}

View File

@@ -17,6 +17,7 @@
using IO;
using Logging;
using Parts;
using Parts.CrossReference;
using Tokenization.Scanner;
using Tokenization.Tokens;
using Util;
@@ -64,18 +65,21 @@
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, pool, bruteForceSearcher);
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider);
var xrefValidator = new XrefOffsetValidator(log);
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser, new CrossReferenceTableParser());
var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
var crossReferenceOffset = container.Get<FileTrailerParser>().GetFirstCrossReferenceOffset(inputBytes, scanner, isLenientParsing);
// TODO: make this use the scanner.
var validator = new CrossReferenceOffsetValidator(new XrefOffsetValidator(log, reader, container.Get<CosDictionaryParser>(),
container.Get<CosBaseParser>(), pool));
var validator = new CrossReferenceOffsetValidator(xrefValidator);
crossReferenceOffset = validator.Validate(crossReferenceOffset, isLenientParsing);
crossReferenceOffset = validator.Validate(crossReferenceOffset, scanner, reader, isLenientParsing);
crossReferenceTable = container.Get<CrossReferenceParser>()
.Parse(reader, isLenientParsing, crossReferenceOffset, pool, pdfScanner, scanner);
crossReferenceTable = crossReferenceParser.Parse(reader, isLenientParsing, crossReferenceOffset, pool, pdfScanner, scanner);
var trueTypeFontParser = new TrueTypeFontParser();
var fontDescriptorFactory = new FontDescriptorFactory();

View File

@@ -4,8 +4,6 @@
using Fonts.Parser;
using Logging;
using Parser.FileStructure;
using Parser.Parts;
using Parser.Parts.CrossReference;
internal static class Bootstrapper
{
@@ -30,13 +28,7 @@
var headerParser = new FileHeaderParser(logger);
var trailerParser = new FileTrailerParser();
var nameParser = new CosNameParser();
var dictionaryParser = new CosDictionaryParser(nameParser, logger);
var baseParser = new CosBaseParser(nameParser, new CosStringParser(), dictionaryParser, new CosArrayParser());
var filterProvider = new MemoryFilterProvider(new DecodeParameterResolver(logger), new PngPredictor(), logger);
var crossReferenceParser = new CrossReferenceStreamParser(filterProvider);
var crossReferenceTableParser = new CrossReferenceParser(logger, dictionaryParser, baseParser, crossReferenceParser, new CrossReferenceTableParser());
var cmapParser = new CMapParser();
var afmParser = new AdobeFontMetricsParser();
@@ -44,11 +36,6 @@
var container = new Container();
container.Register(headerParser);
container.Register(trailerParser);
container.Register(nameParser);
container.Register(dictionaryParser);
container.Register(baseParser);
container.Register(crossReferenceParser);
container.Register(crossReferenceTableParser);
container.Register(filterProvider);
container.Register(cmapParser);
container.Register(afmParser);