mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-09-19 19:07:56 +08:00
fix brute force searcher offsets
The brute force searcher offsets were off by one. With this change, the returned offset points at the object number, i.e. the start of the object number/generation/operator triple.
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
namespace UglyToad.PdfPig.Tests.Parser.Parts
|
||||
{
|
||||
using System;
|
||||
using System.IO;
|
||||
using Integration;
|
||||
using PdfPig.Core;
|
||||
using PdfPig.Parser.Parts;
|
||||
using Xunit;
|
||||
@@ -56,10 +58,10 @@ startxref
|
||||
|
||||
Assert.Equal(locations.Values, new long[]
|
||||
{
|
||||
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase) + 1,
|
||||
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase) + 1,
|
||||
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase) + 1,
|
||||
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase) + 1
|
||||
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
|
||||
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
|
||||
TestData.IndexOf("4 0 obj", StringComparison.OrdinalIgnoreCase),
|
||||
TestData.IndexOf("5 0 obj", StringComparison.OrdinalIgnoreCase)
|
||||
});
|
||||
}
|
||||
|
||||
@@ -83,5 +85,72 @@ startxref
|
||||
Assert.Contains(newLocations.Keys, x => x.Equals(keyValuePair.Key));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Runs the brute-force searcher over the Inkscape sample document, read through a file stream,
/// and checks the number of objects found and the offsets reported for objects 1 to 9.
/// </summary>
[Fact]
public void BruteForceSearcherFileOffsetsCorrect()
{
    var path = IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf");

    using (var stream = File.OpenRead(path))
    {
        var input = new StreamInputBytes(stream);
        var found = new BruteForceSearcher(input).GetObjectLocations();

        Assert.Equal(13, found.Count);

        // Expected offsets for generation 0 objects; index i holds the offset of object number i + 1.
        var expected = new long[] { 6183, 244, 15, 222, 5766, 353, 581, 5068, 5091 };

        for (var i = 0; i < expected.Length; i++)
        {
            Assert.Equal(expected[i], found[new IndirectReference(i + 1, 0)]);
        }

        // The offset must land on the object number itself.
        var text = GetStringAt(input, found[new IndirectReference(3, 0)]);
        Assert.StartsWith("3 0 obj", text);
    }
}
|
||||
|
||||
/// <summary>
/// Runs the brute-force searcher over the Open Office sample document, loaded fully into memory,
/// and checks the number of objects found and the offsets reported for objects 1 to 13.
/// </summary>
[Fact]
public void BruteForceSearcherFileOffsetsCorrectOpenOffice()
{
    var path = IntegrationHelpers.GetDocumentPath("Single Page Simple - from open office.pdf");
    var input = new ByteArrayInputBytes(File.ReadAllBytes(path));

    var found = new BruteForceSearcher(input).GetObjectLocations();

    Assert.Equal(13, found.Count);

    // Expected offsets for generation 0 objects; index i holds the offset of object number i + 1.
    var expected = new long[]
    {
        17, 249, 14291, 275, 382, 13283, 13309,
        13556, 13926, 14183, 14224, 14428, 14488
    };

    for (var i = 0; i < expected.Length; i++)
    {
        Assert.Equal(expected[i], found[new IndirectReference(i + 1, 0)]);
    }

    // The offset must land on the object number itself.
    var text = GetStringAt(input, found[new IndirectReference(12, 0)]);
    Assert.StartsWith("12 0 obj", text);
}
|
||||
|
||||
/// <summary>
/// Seeks <paramref name="bytes"/> to <paramref name="location"/>, reads into a 10-byte buffer
/// and returns that buffer converted by <c>OtherEncodings.BytesAsLatin1String</c>.
/// </summary>
/// <param name="bytes">The input to read from; its position is moved by this call.</param>
/// <param name="location">The offset to seek to before reading.</param>
/// <returns>The string form of the 10-byte buffer.</returns>
private static string GetStringAt(IInputBytes bytes, long location)
{
    var buffer = new byte[10];

    bytes.Seek(location);
    bytes.Read(buffer);

    return OtherEncodings.BytesAsLatin1String(buffer);
}
|
||||
}
|
||||
}
|
||||
|
@@ -5,7 +5,6 @@
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
using Core;
|
||||
using Exceptions;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <summary>
|
||||
@@ -49,10 +48,6 @@
|
||||
bool endobjFound = false;
|
||||
do
|
||||
{
|
||||
if (loopProtection >= 700_000)
|
||||
{
|
||||
|
||||
}
|
||||
if (loopProtection > 1_000_000)
|
||||
{
|
||||
throw new PdfDocumentFormatException("Failed to brute-force search the file due to an infinite loop.");
|
||||
@@ -142,7 +137,7 @@
|
||||
var obj = long.Parse(objectNumberBytes.ToString(), CultureInfo.InvariantCulture);
|
||||
var generation = int.Parse(generationBytes.ToString(), CultureInfo.InvariantCulture);
|
||||
|
||||
results[new IndirectReference(obj, generation)] = bytes.CurrentOffset + 1;
|
||||
results[new IndirectReference(obj, generation)] = bytes.CurrentOffset;
|
||||
|
||||
inObject = true;
|
||||
endobjFound = false;
|
||||
|
@@ -1,10 +1,6 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Core;
|
||||
using CrossReference;
|
||||
using Parser.Parts;
|
||||
using Tokens;
|
||||
|
||||
internal interface IObjectLocationProvider
|
||||
@@ -17,88 +13,4 @@
|
||||
|
||||
void Cache(ObjectToken objectToken, bool force = false);
|
||||
}
|
||||
|
||||
/// <summary>
/// Looks up the offsets of indirect objects and keeps a cache of object tokens.
/// Offsets are taken from the cross reference table once it is available, with the
/// <see cref="BruteForceSearcher"/> results used for references the table does not provide.
/// </summary>
internal class ObjectLocationProvider : IObjectLocationProvider
{
    private readonly Dictionary<IndirectReference, ObjectToken> cache = new Dictionary<IndirectReference, ObjectToken>();

    /// <summary>
    /// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready.
    /// </summary>
    private readonly Func<CrossReferenceTable> crossReferenceTable;
    private readonly BruteForceSearcher searcher;

    /// <summary>
    /// Indicates whether we now have a cross reference table.
    /// </summary>
    private bool loadedFromTable;

    private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>();

    /// <summary>
    /// Creates a provider backed by a lazily evaluated cross reference table and a brute-force searcher.
    /// </summary>
    /// <param name="crossReferenceTable">Factory for the cross reference table; a null result is treated as "not ready yet".</param>
    /// <param name="searcher">Searcher consulted when no stored offset exists for a reference.</param>
    public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, BruteForceSearcher searcher)
    {
        this.crossReferenceTable = crossReferenceTable;
        this.searcher = searcher;
    }

    /// <summary>
    /// Tries to find the offset for <paramref name="reference"/>, first in the stored offsets and then
    /// in the locations reported by the brute-force searcher.
    /// </summary>
    /// <param name="reference">The object to locate.</param>
    /// <param name="offset">The offset found, when the method returns true.</param>
    /// <returns>True if either source contains an offset for the reference.</returns>
    public bool TryGetOffset(IndirectReference reference, out long offset)
    {
        if (!loadedFromTable)
        {
            var table = crossReferenceTable.Invoke();

            if (table != null)
            {
                // Copy the table's offsets once; the flag stops this being repeated on later calls.
                foreach (var objectOffset in table.ObjectOffsets)
                {
                    offsets[objectOffset.Key] = objectOffset.Value;
                }

                loadedFromTable = true;
            }
        }

        if (offsets.TryGetValue(reference, out offset))
        {
            return true;
        }

        // Nothing stored for this reference: fall back to the brute-force search results.
        return searcher.GetObjectLocations().TryGetValue(reference, out offset);
    }

    /// <summary>
    /// Sets or replaces the stored offset for <paramref name="reference"/>.
    /// </summary>
    public void UpdateOffset(IndirectReference reference, long offset)
    {
        offsets[reference] = offset;
    }

    /// <summary>
    /// Tries to get a previously cached token for <paramref name="reference"/>.
    /// </summary>
    public bool TryGetCached(IndirectReference reference, out ObjectToken objectToken)
    {
        return cache.TryGetValue(reference, out objectToken);
    }

    /// <summary>
    /// Caches <paramref name="objectToken"/> under its number. Unless <paramref name="force"/> is set, the
    /// token is ignored when the cross reference table lists a different position for the same object.
    /// </summary>
    /// <param name="objectToken">The token to cache.</param>
    /// <param name="force">When true the position check against the cross reference table is skipped.</param>
    /// <exception cref="ArgumentNullException"><paramref name="objectToken"/> is null.</exception>
    public void Cache(ObjectToken objectToken, bool force = false)
    {
        if (objectToken == null)
        {
            // Name the offending parameter so the failure is diagnosable.
            throw new ArgumentNullException(nameof(objectToken));
        }

        // Don't cache incorrect locations.
        var crossReference = crossReferenceTable();
        if (!force && crossReference != null && crossReference.ObjectOffsets.TryGetValue(objectToken.Number, out var expected)
            && objectToken.Position != expected)
        {
            return;
        }

        cache[objectToken.Number] = objectToken;
    }
}
|
||||
}
|
@@ -0,0 +1,93 @@
|
||||
namespace UglyToad.PdfPig.Tokenization.Scanner
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Core;
|
||||
using CrossReference;
|
||||
using Parser.Parts;
|
||||
using Tokens;
|
||||
|
||||
/// <summary>
/// Looks up the offsets of indirect objects and keeps a cache of object tokens.
/// Offsets are taken from the cross reference table once it is available, with the
/// <see cref="BruteForceSearcher"/> results used for references the table does not provide.
/// </summary>
internal class ObjectLocationProvider : IObjectLocationProvider
{
    private readonly Dictionary<IndirectReference, ObjectToken> cache = new Dictionary<IndirectReference, ObjectToken>();

    /// <summary>
    /// Since we want to scan objects while reading the cross reference table we lazily load it when it's ready.
    /// </summary>
    private readonly Func<CrossReferenceTable> crossReferenceTable;
    private readonly BruteForceSearcher searcher;

    /// <summary>
    /// Indicates whether we now have a cross reference table.
    /// </summary>
    private bool loadedFromTable;

    private readonly Dictionary<IndirectReference, long> offsets = new Dictionary<IndirectReference, long>();

    /// <summary>
    /// Creates a provider backed by a lazily evaluated cross reference table and a brute-force searcher.
    /// </summary>
    /// <param name="crossReferenceTable">Factory for the cross reference table; a null result is treated as "not ready yet".</param>
    /// <param name="searcher">Searcher consulted when no stored offset exists for a reference.</param>
    public ObjectLocationProvider(Func<CrossReferenceTable> crossReferenceTable, BruteForceSearcher searcher)
    {
        this.crossReferenceTable = crossReferenceTable;
        this.searcher = searcher;
    }

    /// <summary>
    /// Tries to find the offset for <paramref name="reference"/>, first in the stored offsets and then
    /// in the locations reported by the brute-force searcher.
    /// </summary>
    /// <param name="reference">The object to locate.</param>
    /// <param name="offset">The offset found, when the method returns true.</param>
    /// <returns>True if either source contains an offset for the reference.</returns>
    public bool TryGetOffset(IndirectReference reference, out long offset)
    {
        if (!loadedFromTable)
        {
            var table = crossReferenceTable.Invoke();

            if (table != null)
            {
                // Copy the table's offsets once; the flag stops this being repeated on later calls.
                foreach (var objectOffset in table.ObjectOffsets)
                {
                    offsets[objectOffset.Key] = objectOffset.Value;
                }

                loadedFromTable = true;
            }
        }

        if (offsets.TryGetValue(reference, out offset))
        {
            return true;
        }

        // Nothing stored for this reference: fall back to the brute-force search results.
        return searcher.GetObjectLocations().TryGetValue(reference, out offset);
    }

    /// <summary>
    /// Sets or replaces the stored offset for <paramref name="reference"/>.
    /// </summary>
    public void UpdateOffset(IndirectReference reference, long offset)
    {
        offsets[reference] = offset;
    }

    /// <summary>
    /// Tries to get a previously cached token for <paramref name="reference"/>.
    /// </summary>
    public bool TryGetCached(IndirectReference reference, out ObjectToken objectToken)
    {
        return cache.TryGetValue(reference, out objectToken);
    }

    /// <summary>
    /// Caches <paramref name="objectToken"/> under its number. Unless <paramref name="force"/> is set, the
    /// token is ignored when the cross reference table lists a different position for the same object.
    /// </summary>
    /// <param name="objectToken">The token to cache.</param>
    /// <param name="force">When true the position check against the cross reference table is skipped.</param>
    /// <exception cref="ArgumentNullException"><paramref name="objectToken"/> is null.</exception>
    public void Cache(ObjectToken objectToken, bool force = false)
    {
        if (objectToken == null)
        {
            // Name the offending parameter so the failure is diagnosable.
            throw new ArgumentNullException(nameof(objectToken));
        }

        // Don't cache incorrect locations.
        var crossReference = crossReferenceTable();
        if (!force && crossReference != null && crossReference.ObjectOffsets.TryGetValue(objectToken.Number, out var expected)
            && objectToken.Position != expected)
        {
            return;
        }

        cache[objectToken.Number] = objectToken;
    }
}
|
||||
}
|
Reference in New Issue
Block a user