mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
begin to rework cross-reference parsing
Most of the cross-reference code is the earliest code in the project and hasn't been revisited since it was first written. Issue #88 has been reopened due to a bug with brute-force searching, so this commit tidies up the code in this area ahead of trying to fix the bug.
This commit is contained in:
@@ -36,10 +36,18 @@ startxref
|
|||||||
216
|
216
|
||||||
%%EOF";
|
%%EOF";
|
||||||
|
|
||||||
|
private static readonly long[] TestDataOffsets =
|
||||||
|
{
|
||||||
|
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
|
||||||
|
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
|
||||||
|
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
|
||||||
|
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
|
||||||
|
};
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
public void ReaderNull_Throws()
|
public void ReaderNull_Throws()
|
||||||
{
|
{
|
||||||
Action action = () => new BruteForceSearcher(null);
|
Action action = () => BruteForceSearcher.GetObjectLocations(null);
|
||||||
|
|
||||||
Assert.Throws<ArgumentNullException>(action);
|
Assert.Throws<ArgumentNullException>(action);
|
||||||
}
|
}
|
||||||
@@ -50,19 +58,11 @@ startxref
|
|||||||
{
|
{
|
||||||
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
|
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
|
||||||
|
|
||||||
var searcher = new BruteForceSearcher(input);
|
var locations = BruteForceSearcher.GetObjectLocations(input);
|
||||||
|
|
||||||
var locations = searcher.GetObjectLocations();
|
|
||||||
|
|
||||||
Assert.Equal(4, locations.Count);
|
Assert.Equal(4, locations.Count);
|
||||||
|
|
||||||
Assert.Equal(locations.Values, new long[]
|
Assert.Equal(TestDataOffsets, locations.Values);
|
||||||
{
|
|
||||||
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
|
|
||||||
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
|
|
||||||
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
|
|
||||||
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
[Fact]
|
[Fact]
|
||||||
@@ -70,13 +70,11 @@ startxref
|
|||||||
{
|
{
|
||||||
var reader = StringBytesTestConverter.Convert(TestData, false);
|
var reader = StringBytesTestConverter.Convert(TestData, false);
|
||||||
|
|
||||||
var searcher = new BruteForceSearcher(reader.Bytes);
|
var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
|
||||||
|
|
||||||
var locations = searcher.GetObjectLocations();
|
|
||||||
|
|
||||||
Assert.Equal(4, locations.Count);
|
Assert.Equal(4, locations.Count);
|
||||||
|
|
||||||
var newLocations = searcher.GetObjectLocations();
|
var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
|
||||||
|
|
||||||
Assert.Equal(4, locations.Count);
|
Assert.Equal(4, locations.Count);
|
||||||
|
|
||||||
@@ -92,9 +90,8 @@ startxref
|
|||||||
using (var fs = File.OpenRead(IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf")))
|
using (var fs = File.OpenRead(IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf")))
|
||||||
{
|
{
|
||||||
var bytes = new StreamInputBytes(fs);
|
var bytes = new StreamInputBytes(fs);
|
||||||
var searcher = new BruteForceSearcher(bytes);
|
|
||||||
|
|
||||||
var locations = searcher.GetObjectLocations();
|
var locations = BruteForceSearcher.GetObjectLocations(bytes);
|
||||||
|
|
||||||
Assert.Equal(13, locations.Count);
|
Assert.Equal(13, locations.Count);
|
||||||
|
|
||||||
@@ -118,9 +115,7 @@ startxref
|
|||||||
{
|
{
|
||||||
var bytes = new ByteArrayInputBytes(File.ReadAllBytes(IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf")));
|
var bytes = new ByteArrayInputBytes(File.ReadAllBytes(IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf")));
|
||||||
|
|
||||||
var searcher = new BruteForceSearcher(bytes);
|
var locations = BruteForceSearcher.GetObjectLocations(bytes);
|
||||||
|
|
||||||
var locations = searcher.GetObjectLocations();
|
|
||||||
|
|
||||||
Assert.Equal(13, locations.Count);
|
Assert.Equal(13, locations.Count);
|
||||||
|
|
||||||
@@ -142,6 +137,18 @@ startxref
|
|||||||
Assert.StartsWith("12 0 obj", s);
|
Assert.StartsWith("12 0 obj", s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public void BruteForceSearcherCorrectlyFindsAllObjectsWhenOffset()
|
||||||
|
{
|
||||||
|
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
|
||||||
|
|
||||||
|
input.Seek(593);
|
||||||
|
|
||||||
|
var locations = BruteForceSearcher.GetObjectLocations(input);
|
||||||
|
|
||||||
|
Assert.Equal(TestDataOffsets, locations.Values);
|
||||||
|
}
|
||||||
|
|
||||||
private static string GetStringAt(IInputBytes bytes, long location)
|
private static string GetStringAt(IInputBytes bytes, long location)
|
||||||
{
|
{
|
||||||
bytes.Seek(location);
|
bytes.Seek(location);
|
||||||
|
@@ -0,0 +1,161 @@
|
|||||||
|
namespace UglyToad.PdfPig.Parser.FileStructure
|
||||||
|
{
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using Core;
|
||||||
|
using CrossReference;
|
||||||
|
using Logging;
|
||||||
|
using Parts;
|
||||||
|
|
||||||
|
/// <summary>
/// Checks that the object offsets recorded in a cross-reference table actually point at the
/// objects they claim to, and falls back to a brute-force search of the document when they do not.
/// </summary>
internal static class CrossReferenceObjectOffsetValidator
{
    // Offsets below this value are rejected outright: there can't be any object at the very beginning of a pdf.
    private const long MinimumSearchOffset = 6;

    /// <summary>
    /// Check that the offsets in the cross reference are correct.
    /// </summary>
    /// <param name="bytes">The bytes of the document. The current position is restored before returning.</param>
    /// <param name="crossReferenceTable">The cross-reference table whose object offsets should be checked.</param>
    /// <param name="log">Receives an error message when an incorrect offset is detected.</param>
    /// <param name="actualOffsets">
    /// The offsets to use for the document. When validation succeeds, or when the brute-force search finds nothing,
    /// these are the offsets from <paramref name="crossReferenceTable"/> unchanged. Otherwise they are a copy of the
    /// table's offsets with every object located by the brute-force search overwritten by the offset it was found at.
    /// </param>
    /// <returns>
    /// <see langword="true"/> if every offset in the table was verified; <see langword="false"/> if at least one
    /// offset was incorrect, in which case the caller should use <paramref name="actualOffsets"/> instead.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="bytes"/> or <paramref name="crossReferenceTable"/> is <see langword="null"/>.
    /// </exception>
    public static bool ValidateCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable crossReferenceTable, ILog log,
        out IReadOnlyDictionary<IndirectReference, long> actualOffsets)
    {
        if (bytes == null)
        {
            throw new ArgumentNullException(nameof(bytes));
        }

        if (crossReferenceTable == null)
        {
            throw new ArgumentNullException(nameof(crossReferenceTable));
        }

        actualOffsets = crossReferenceTable.ObjectOffsets;

        if (ValidateXrefOffsets(bytes, crossReferenceTable.ObjectOffsets, log))
        {
            return true;
        }

        var bruteForceOffsets = BruteForceSearcher.GetObjectLocations(bytes);

        if (bruteForceOffsets.Count == 0)
        {
            // The search found nothing to repair with, so the table's own offsets are the best information available.
            return false;
        }

        // Start from the offsets in the table. This keeps the negative entries, which are never checked against
        // the file (see ValidateXrefOffsets) and which a scan for top-level objects cannot replace.
        // TODO: objects contained in object streams are not cross-checked against the brute-force results yet.
        var repairedOffsets = new Dictionary<IndirectReference, long>();

        foreach (var entry in crossReferenceTable.ObjectOffsets)
        {
            repairedOffsets[entry.Key] = entry.Value;
        }

        // Wherever the brute-force search located an object, its offset replaces the one from the table.
        foreach (var item in bruteForceOffsets)
        {
            repairedOffsets[item.Key] = item.Value;
        }

        actualOffsets = repairedOffsets;

        return false;
    }

    /// <summary>
    /// Verify each non-negative offset by looking for the matching object header at that position.
    /// </summary>
    /// <param name="bytes">The bytes of the document.</param>
    /// <param name="objectOffsets">The offsets to verify; <see langword="null"/> is treated as valid.</param>
    /// <param name="log">Receives an error message for the first incorrect offset found.</param>
    /// <returns><see langword="false"/> as soon as one offset fails the check, otherwise <see langword="true"/>.</returns>
    private static bool ValidateXrefOffsets(IInputBytes bytes, IReadOnlyDictionary<IndirectReference, long> objectOffsets, ILog log)
    {
        if (objectOffsets == null)
        {
            return true;
        }

        foreach (var objectEntry in objectOffsets)
        {
            var objectKey = objectEntry.Key;
            var objectOffset = objectEntry.Value;

            // Negative values are not file positions, so there is nothing to look up for them.
            if (objectOffset < 0)
            {
                continue;
            }

            if (!CheckObjectKeys(bytes, objectKey, objectOffset))
            {
                log.Error($"At least one cross-reference offset was incorrect. {objectKey} could not be found at {objectOffset}. " +
                          "Using brute-force search to repair object offsets.");

                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Check whether the header text for <paramref name="objectKey"/> is present at <paramref name="offset"/>.
    /// </summary>
    /// <param name="bytes">The bytes of the document. The current position is always restored before returning.</param>
    /// <param name="objectKey">The object number and generation expected at the offset.</param>
    /// <param name="offset">The position in the document where the object is claimed to start.</param>
    /// <returns>
    /// <see langword="true"/> if the expected text was found at the offset (optionally preceded by one whitespace
    /// byte); <see langword="false"/> if it was not found, the offset is too small, or reading failed.
    /// </returns>
    private static bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
    {
        // there can't be any object at the very beginning of a pdf
        if (offset < MinimumSearchOffset)
        {
            return false;
        }

        var objectNr = objectKey.ObjectNumber;
        long objectGen = objectKey.Generation;
        var originOffset = bytes.CurrentOffset;

        // The text to look for at the offset, built by the project's helper from the object number and generation.
        var objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);

        try
        {
            bytes.Seek(offset);

            // Tolerate a single whitespace byte in front of the object header.
            if (ReadHelper.IsWhitespace(bytes.CurrentByte))
            {
                bytes.MoveNext();
            }

            return ReadHelper.IsString(bytes, objectString);
        }
        catch (Exception)
        {
            // Deliberately best-effort: any failure while probing means there is no valid object at this offset.
            return false;
        }
        finally
        {
            // Single place where the original position is restored, for every exit path.
            bytes.Seek(originOffset);
        }
    }
}
|
||||||
|
}
|
@@ -4,7 +4,6 @@
|
|||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
using Core;
|
using Core;
|
||||||
using CrossReference;
|
using CrossReference;
|
||||||
using Exceptions;
|
|
||||||
using Logging;
|
using Logging;
|
||||||
using Parts.CrossReference;
|
using Parts.CrossReference;
|
||||||
using Tokenization.Scanner;
|
using Tokenization.Scanner;
|
||||||
@@ -15,16 +14,13 @@
|
|||||||
private readonly ILog log;
|
private readonly ILog log;
|
||||||
private readonly XrefOffsetValidator offsetValidator;
|
private readonly XrefOffsetValidator offsetValidator;
|
||||||
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
|
private readonly CrossReferenceStreamParser crossReferenceStreamParser;
|
||||||
private readonly XrefCosOffsetChecker xrefCosChecker;
|
|
||||||
|
|
||||||
public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
|
public CrossReferenceParser(ILog log, XrefOffsetValidator offsetValidator,
|
||||||
XrefCosOffsetChecker xrefCosChecker,
|
|
||||||
CrossReferenceStreamParser crossReferenceStreamParser)
|
CrossReferenceStreamParser crossReferenceStreamParser)
|
||||||
{
|
{
|
||||||
this.log = log;
|
this.log = log;
|
||||||
this.offsetValidator = offsetValidator;
|
this.offsetValidator = offsetValidator;
|
||||||
this.crossReferenceStreamParser = crossReferenceStreamParser;
|
this.crossReferenceStreamParser = crossReferenceStreamParser;
|
||||||
this.xrefCosChecker = xrefCosChecker;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
|
public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long crossReferenceLocation,
|
||||||
@@ -214,7 +210,10 @@
|
|||||||
var resolved = table.Build(crossReferenceLocation, log);
|
var resolved = table.Build(crossReferenceLocation, log);
|
||||||
|
|
||||||
// check the offsets of all referenced objects
|
// check the offsets of all referenced objects
|
||||||
xrefCosChecker.CheckCrossReferenceOffsets(bytes, resolved, isLenientParsing);
|
if (!CrossReferenceObjectOffsetValidator.ValidateCrossReferenceOffsets(bytes, resolved, log, out var actualOffsets))
|
||||||
|
{
|
||||||
|
resolved = new CrossReferenceTable(resolved.Type, actualOffsets, resolved.Trailer, resolved.CrossReferenceOffsets);
|
||||||
|
}
|
||||||
|
|
||||||
return resolved;
|
return resolved;
|
||||||
}
|
}
|
||||||
|
@@ -1,236 +0,0 @@
|
|||||||
namespace UglyToad.PdfPig.Parser.FileStructure
|
|
||||||
{
|
|
||||||
using System;
|
|
||||||
using System.Collections.Generic;
|
|
||||||
using System.Linq;
|
|
||||||
using Core;
|
|
||||||
using CrossReference;
|
|
||||||
using Logging;
|
|
||||||
using Parts;
|
|
||||||
|
|
||||||
internal class XrefCosOffsetChecker
|
|
||||||
{
|
|
||||||
private static readonly long MINIMUM_SEARCH_OFFSET = 6;
|
|
||||||
|
|
||||||
private readonly ILog log;
|
|
||||||
private readonly BruteForceSearcher bruteForceSearcher;
|
|
||||||
|
|
||||||
private IReadOnlyDictionary<IndirectReference, long> objectKeyOffsets;
|
|
||||||
|
|
||||||
public XrefCosOffsetChecker(ILog log, BruteForceSearcher bruteForceSearcher)
|
|
||||||
{
|
|
||||||
this.log = log;
|
|
||||||
this.bruteForceSearcher = bruteForceSearcher;
|
|
||||||
}
|
|
||||||
|
|
||||||
private bool ValidateXrefOffsets(IInputBytes bytes, Dictionary<IndirectReference, long> xrefOffset)
|
|
||||||
{
|
|
||||||
if (xrefOffset == null)
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (var objectEntry in xrefOffset)
|
|
||||||
{
|
|
||||||
IndirectReference objectKey = objectEntry.Key;
|
|
||||||
long objectOffset = objectEntry.Value;
|
|
||||||
|
|
||||||
// a negative offset number represents a object number itself
|
|
||||||
// see type 2 entry in xref stream
|
|
||||||
if (objectOffset >= 0 && !CheckObjectKeys(bytes, objectKey, objectOffset))
|
|
||||||
{
|
|
||||||
log.Debug($"Stop checking xref offsets as at least one ({objectKey}) couldn't be dereferenced");
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
|
|
||||||
{
|
|
||||||
// there can't be any object at the very beginning of a pdf
|
|
||||||
if (offset < MINIMUM_SEARCH_OFFSET)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
long objectNr = objectKey.ObjectNumber;
|
|
||||||
long objectGen = objectKey.Generation;
|
|
||||||
long originOffset = bytes.CurrentOffset;
|
|
||||||
|
|
||||||
string objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
bytes.Seek(offset);
|
|
||||||
|
|
||||||
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
|
|
||||||
{
|
|
||||||
bytes.MoveNext();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ReadHelper.IsString(bytes, objectString))
|
|
||||||
{
|
|
||||||
// everything is ok, return origin object key
|
|
||||||
bytes.Seek(originOffset);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (Exception)
|
|
||||||
{
|
|
||||||
// Swallow the exception, obviously there isn't any valid object number
|
|
||||||
}
|
|
||||||
finally
|
|
||||||
{
|
|
||||||
bytes.Seek(originOffset);
|
|
||||||
}
|
|
||||||
|
|
||||||
// no valid object number found
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private IReadOnlyDictionary<IndirectReference, long> getBFCosObjectOffsets()
|
|
||||||
{
|
|
||||||
if (objectKeyOffsets == null)
|
|
||||||
{
|
|
||||||
var offsets = bruteForceSearcher.GetObjectLocations();
|
|
||||||
|
|
||||||
objectKeyOffsets = offsets;
|
|
||||||
}
|
|
||||||
|
|
||||||
return objectKeyOffsets;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// <summary>
|
|
||||||
/// Check that the offsets in the cross reference are correct.
|
|
||||||
/// </summary>
|
|
||||||
public void CheckCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
|
|
||||||
{
|
|
||||||
// repair mode isn't available in non-lenient mode
|
|
||||||
if (!isLenientParsing)
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
Dictionary<IndirectReference, long> xrefOffset = xrefTrailerResolver.ObjectOffsets.ToDictionary(x => x.Key, x => x.Value);
|
|
||||||
if (ValidateXrefOffsets(bytes, xrefOffset))
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
IReadOnlyDictionary<IndirectReference, long> bfCOSObjectKeyOffsets = getBFCosObjectOffsets();
|
|
||||||
if (bfCOSObjectKeyOffsets.Count > 0)
|
|
||||||
{
|
|
||||||
List<IndirectReference> objStreams = new List<IndirectReference>();
|
|
||||||
// find all object streams
|
|
||||||
foreach (var entry in xrefOffset)
|
|
||||||
{
|
|
||||||
long offset = entry.Value;
|
|
||||||
if (offset < 0)
|
|
||||||
{
|
|
||||||
IndirectReference objStream = new IndirectReference(-offset, 0);
|
|
||||||
if (!objStreams.Contains(objStream))
|
|
||||||
{
|
|
||||||
objStreams.Add(new IndirectReference(-offset, 0));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// remove all found object streams
|
|
||||||
if (objStreams.Count > 0)
|
|
||||||
{
|
|
||||||
foreach (IndirectReference key in objStreams)
|
|
||||||
{
|
|
||||||
if (bfCOSObjectKeyOffsets.ContainsKey(key))
|
|
||||||
{
|
|
||||||
// remove all parsed objects which are part of an object stream
|
|
||||||
//ISet<long> objects = xrefTrailerResolver
|
|
||||||
// .getContainedObjectNumbers((int)(key.Number));
|
|
||||||
//foreach (long objNr in objects)
|
|
||||||
//{
|
|
||||||
// CosObjectKey streamObjectKey = new CosObjectKey(objNr, 0);
|
|
||||||
|
|
||||||
// if (bfCOSObjectKeyOffsets.TryGetValue(streamObjectKey, out long streamObjectOffset) && streamObjectOffset > 0)
|
|
||||||
// {
|
|
||||||
// bfCOSObjectKeyOffsets.Remove(streamObjectKey);
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// remove all objects which are part of an object stream which wasn't found
|
|
||||||
//ISet<long> objects = xrefTrailerResolver
|
|
||||||
// .getContainedObjectNumbers((int)(key.Number));
|
|
||||||
//foreach (long objNr in objects)
|
|
||||||
//{
|
|
||||||
// xrefOffset.Remove(new CosObjectKey(objNr, 0));
|
|
||||||
//}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (var item in bfCOSObjectKeyOffsets)
|
|
||||||
{
|
|
||||||
xrefOffset[item.Key] = item.Value;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private long? lastEndOfFileMarker;
|
|
||||||
|
|
||||||
private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
|
|
||||||
{
|
|
||||||
if (lastEndOfFileMarker != null)
|
|
||||||
{
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
long startOffset = source.CurrentOffset;
|
|
||||||
|
|
||||||
source.Seek(MINIMUM_SEARCH_OFFSET);
|
|
||||||
|
|
||||||
while (!source.IsAtEnd())
|
|
||||||
{
|
|
||||||
// search for EOF marker
|
|
||||||
if (ReadHelper.IsString(source, "%%EOF"))
|
|
||||||
{
|
|
||||||
long tempMarker = source.CurrentOffset;
|
|
||||||
|
|
||||||
if (tempMarker >= source.Length)
|
|
||||||
{
|
|
||||||
lastEndOfFileMarker = tempMarker;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
try
|
|
||||||
{
|
|
||||||
source.Seek(tempMarker + 5);
|
|
||||||
// check if the following data is some valid pdf content
|
|
||||||
// which most likely indicates that the pdf is linearized,
|
|
||||||
// updated or just cut off somewhere in the middle
|
|
||||||
ReadHelper.SkipSpaces(source);
|
|
||||||
ObjectHelper.ReadObjectNumber(source);
|
|
||||||
ObjectHelper.ReadGenerationNumber(source);
|
|
||||||
}
|
|
||||||
catch (Exception)
|
|
||||||
{
|
|
||||||
// save the EOF marker as the following data is most likely some garbage
|
|
||||||
lastEndOfFileMarker = tempMarker;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
source.MoveNext();
|
|
||||||
}
|
|
||||||
|
|
||||||
source.Seek(startOffset);
|
|
||||||
|
|
||||||
// no EOF marker found
|
|
||||||
if (lastEndOfFileMarker == null)
|
|
||||||
{
|
|
||||||
lastEndOfFileMarker = long.MaxValue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@@ -2,20 +2,16 @@
|
|||||||
{
|
{
|
||||||
using System;
|
using System;
|
||||||
using Content;
|
using Content;
|
||||||
using Parts;
|
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// For objects which provide document scoped caching.
|
/// For objects which provide document scoped caching.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
internal class ParsingCachingProviders
|
internal class ParsingCachingProviders
|
||||||
{
|
{
|
||||||
public BruteForceSearcher BruteForceSearcher { get; }
|
|
||||||
|
|
||||||
public IResourceStore ResourceContainer { get; }
|
public IResourceStore ResourceContainer { get; }
|
||||||
|
|
||||||
public ParsingCachingProviders(BruteForceSearcher bruteForceSearcher, IResourceStore resourceContainer)
|
public ParsingCachingProviders(IResourceStore resourceContainer)
|
||||||
{
|
{
|
||||||
BruteForceSearcher = bruteForceSearcher ?? throw new ArgumentNullException(nameof(bruteForceSearcher));
|
|
||||||
ResourceContainer = resourceContainer ?? throw new ArgumentNullException(nameof(resourceContainer));
|
ResourceContainer = resourceContainer ?? throw new ArgumentNullException(nameof(resourceContainer));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -8,44 +8,37 @@
|
|||||||
using Util.JetBrains.Annotations;
|
using Util.JetBrains.Annotations;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Store the results of a brute force search for all objects in the document so we only do it once.
|
/// Brute force search for all objects in the document.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
internal class BruteForceSearcher
|
internal static class BruteForceSearcher
|
||||||
{
|
{
|
||||||
private const int MinimumSearchOffset = 6;
|
private const int MinimumSearchOffset = 6;
|
||||||
|
|
||||||
private readonly IInputBytes bytes;
|
/// <summary>
|
||||||
|
/// Find the offset of every object contained in the document by searching the entire document contents.
|
||||||
private Dictionary<IndirectReference, long> objectLocations;
|
/// </summary>
|
||||||
|
/// <param name="bytes">The bytes of the document.</param>
|
||||||
public BruteForceSearcher([NotNull] IInputBytes bytes)
|
/// <returns>The object keys and offsets for the objects in this document.</returns>
|
||||||
{
|
|
||||||
this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
|
|
||||||
}
|
|
||||||
|
|
||||||
[NotNull]
|
[NotNull]
|
||||||
public IReadOnlyDictionary<IndirectReference, long> GetObjectLocations()
|
public static IReadOnlyDictionary<IndirectReference, long> GetObjectLocations(IInputBytes bytes)
|
||||||
{
|
{
|
||||||
if (objectLocations != null)
|
if (bytes == null)
|
||||||
{
|
{
|
||||||
return objectLocations;
|
throw new ArgumentNullException(nameof(bytes));
|
||||||
}
|
}
|
||||||
|
|
||||||
var loopProtection = 0;
|
var loopProtection = 0;
|
||||||
|
|
||||||
var lastEndOfFile = GetLastEndOfFileMarker();
|
var lastEndOfFile = GetLastEndOfFileMarker(bytes);
|
||||||
|
|
||||||
var results = new Dictionary<IndirectReference, long>();
|
var results = new Dictionary<IndirectReference, long>();
|
||||||
|
|
||||||
var originPosition = bytes.CurrentOffset;
|
var originPosition = bytes.CurrentOffset;
|
||||||
|
|
||||||
long currentOffset = MinimumSearchOffset;
|
var currentOffset = (long)MinimumSearchOffset;
|
||||||
long lastObjectId = long.MinValue;
|
|
||||||
int lastGenerationId = int.MinValue;
|
var currentlyInObject = false;
|
||||||
long lastObjOffset = long.MinValue;
|
|
||||||
|
|
||||||
bool inObject = false;
|
|
||||||
bool endobjFound = false;
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
if (loopProtection > 1_000_000)
|
if (loopProtection > 1_000_000)
|
||||||
@@ -55,7 +48,7 @@
|
|||||||
|
|
||||||
loopProtection++;
|
loopProtection++;
|
||||||
|
|
||||||
if (inObject)
|
if (currentlyInObject)
|
||||||
{
|
{
|
||||||
if (bytes.CurrentByte == 'e')
|
if (bytes.CurrentByte == 'e')
|
||||||
{
|
{
|
||||||
@@ -65,8 +58,7 @@
|
|||||||
{
|
{
|
||||||
if (ReadHelper.IsString(bytes, "endobj"))
|
if (ReadHelper.IsString(bytes, "endobj"))
|
||||||
{
|
{
|
||||||
inObject = false;
|
currentlyInObject = false;
|
||||||
endobjFound = true;
|
|
||||||
loopProtection = 0;
|
loopProtection = 0;
|
||||||
|
|
||||||
for (int i = 0; i < "endobj".Length; i++)
|
for (int i = 0; i < "endobj".Length; i++)
|
||||||
@@ -139,8 +131,7 @@
|
|||||||
|
|
||||||
results[new IndirectReference(obj, generation)] = bytes.CurrentOffset;
|
results[new IndirectReference(obj, generation)] = bytes.CurrentOffset;
|
||||||
|
|
||||||
inObject = true;
|
currentlyInObject = true;
|
||||||
endobjFound = false;
|
|
||||||
|
|
||||||
currentOffset++;
|
currentOffset++;
|
||||||
|
|
||||||
@@ -148,22 +139,13 @@
|
|||||||
loopProtection = 0;
|
loopProtection = 0;
|
||||||
} while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());
|
} while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());
|
||||||
|
|
||||||
if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0)
|
|
||||||
{
|
|
||||||
// if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
|
|
||||||
// the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
|
|
||||||
results[new IndirectReference(lastObjectId, lastGenerationId)] = lastObjOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
// reestablish origin position
|
// reestablish origin position
|
||||||
bytes.Seek(originPosition);
|
bytes.Seek(originPosition);
|
||||||
|
|
||||||
objectLocations = results;
|
return results;
|
||||||
|
|
||||||
return objectLocations;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private long GetLastEndOfFileMarker()
|
private static long GetLastEndOfFileMarker(IInputBytes bytes)
|
||||||
{
|
{
|
||||||
var originalOffset = bytes.CurrentOffset;
|
var originalOffset = bytes.CurrentOffset;
|
||||||
|
|
||||||
|
@@ -84,17 +84,15 @@
|
|||||||
|
|
||||||
CrossReferenceTable crossReferenceTable = null;
|
CrossReferenceTable crossReferenceTable = null;
|
||||||
|
|
||||||
var bruteForceSearcher = new BruteForceSearcher(inputBytes);
|
|
||||||
var xrefValidator = new XrefOffsetValidator(log);
|
var xrefValidator = new XrefOffsetValidator(log);
|
||||||
var objectChecker = new XrefCosOffsetChecker(log, bruteForceSearcher);
|
|
||||||
|
|
||||||
// We're ok with this since our intent is to lazily load the cross reference table.
|
// We're ok with this since our intent is to lazily load the cross reference table.
|
||||||
// ReSharper disable once AccessToModifiedClosure
|
// ReSharper disable once AccessToModifiedClosure
|
||||||
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, bruteForceSearcher);
|
var locationProvider = new ObjectLocationProvider(() => crossReferenceTable, inputBytes);
|
||||||
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
|
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
|
||||||
|
|
||||||
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
||||||
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, objectChecker, crossReferenceStreamParser);
|
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser);
|
||||||
|
|
||||||
var version = FileHeaderParser.Parse(scanner, isLenientParsing, log);
|
var version = FileHeaderParser.Parse(scanner, isLenientParsing, log);
|
||||||
|
|
||||||
@@ -144,7 +142,7 @@
|
|||||||
new PageContentParser(new ReflectionGraphicsStateOperationFactory()),
|
new PageContentParser(new ReflectionGraphicsStateOperationFactory()),
|
||||||
log);
|
log);
|
||||||
|
|
||||||
var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer);
|
var caching = new ParsingCachingProviders(resourceContainer);
|
||||||
|
|
||||||
var acroFormFactory = new AcroFormFactory(pdfScanner, filterProvider, crossReferenceTable);
|
var acroFormFactory = new AcroFormFactory(pdfScanner, filterProvider, crossReferenceTable);
|
||||||
var bookmarksProvider = new BookmarksProvider(log, pdfScanner);
|
var bookmarksProvider = new BookmarksProvider(log, pdfScanner);
|
||||||
|
@@ -15,7 +15,10 @@
|
|||||||
/// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready.
|
/// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private readonly Func<CrossReferenceTable> crossReferenceTable;
|
private readonly Func<CrossReferenceTable> crossReferenceTable;
|
||||||
private readonly BruteForceSearcher searcher;
|
|
||||||
|
private readonly IInputBytes bytes;
|
||||||
|
|
||||||
|
private IReadOnlyDictionary<IndirectReference, long> bruteForcedOffsets;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Indicates whether we now have a cross reference table.
|
/// Indicates whether we now have a cross reference table.
|
||||||
@@ -24,10 +27,10 @@
|
|||||||
|
|
||||||
private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>();
|
private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>();
|
||||||
|
|
||||||
public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, BruteForceSearcher searcher)
|
public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, IInputBytes bytes)
|
||||||
{
|
{
|
||||||
this.crossReferenceTable = crossReferenceTable;
|
this.crossReferenceTable = crossReferenceTable;
|
||||||
this.searcher = searcher;
|
this.bytes = bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
public bool TryGetOffset(IndirectReference reference, out long offset)
|
public bool TryGetOffset(IndirectReference reference, out long offset)
|
||||||
@@ -52,14 +55,12 @@
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
var locations = searcher.GetObjectLocations();
|
if (bruteForcedOffsets == null)
|
||||||
|
|
||||||
if (locations.TryGetValue(reference, out offset))
|
|
||||||
{
|
{
|
||||||
return true;
|
bruteForcedOffsets = BruteForceSearcher.GetObjectLocations(bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return bruteForcedOffsets.TryGetValue(reference, out offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void UpdateOffset(IndirectReference reference, long offset)
|
public void UpdateOffset(IndirectReference reference, long offset)
|
||||||
|
@@ -73,14 +73,13 @@
|
|||||||
|
|
||||||
var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log);
|
var version = FileHeaderParser.Parse(coreScanner, isLenientParsing, Log);
|
||||||
|
|
||||||
var bruteForceSearcher = new BruteForceSearcher(inputBytes);
|
var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log),
|
||||||
var crossReferenceParser = new CrossReferenceParser(Log, new XrefOffsetValidator(Log), new XrefCosOffsetChecker(Log, bruteForceSearcher),
|
|
||||||
new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));
|
new Parser.Parts.CrossReference.CrossReferenceStreamParser(FilterProvider));
|
||||||
|
|
||||||
CrossReferenceTable crossReference = null;
|
CrossReferenceTable crossReference = null;
|
||||||
|
|
||||||
// ReSharper disable once AccessToModifiedClosure
|
// ReSharper disable once AccessToModifiedClosure
|
||||||
var locationProvider = new ObjectLocationProvider(() => crossReference, bruteForceSearcher);
|
var locationProvider = new ObjectLocationProvider(() => crossReference, inputBytes);
|
||||||
|
|
||||||
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);
|
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, FilterProvider, NoOpEncryptionHandler.Instance);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user