diff --git a/src/UglyToad.PdfPig.Core/ReadHelper.cs b/src/UglyToad.PdfPig.Core/ReadHelper.cs
index 6859602b..fe205bfd 100644
--- a/src/UglyToad.PdfPig.Core/ReadHelper.cs
+++ b/src/UglyToad.PdfPig.Core/ReadHelper.cs
@@ -123,7 +123,7 @@
}
///
- /// Determines if a character is whitespace or not.
+ /// Determines if a character is whitespace or not; this includes newlines.
///
///
/// These values are specified in table 1 (page 12) of ISO 32000-1:2008.
diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
index b8e995cc..195e744d 100644
--- a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs
@@ -36,7 +36,7 @@ startxref
216
%%EOF";
- private static readonly long[] TestDataOffsets =
+ private static readonly long[] TestDataOffsets =
{
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
@@ -57,7 +57,7 @@ startxref
public void SearcherFindsCorrectObjects()
{
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
-
+
var locations = BruteForceSearcher.GetObjectLocations(input);
Assert.Equal(4, locations.Count);
@@ -69,11 +69,11 @@ startxref
public void ReaderOnlyCallsOnce()
{
var reader = StringBytesTestConverter.Convert(TestData, false);
-
+
var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
Assert.Equal(4, locations.Count);
-
+
var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
Assert.Equal(4, locations.Count);
@@ -132,7 +132,7 @@ endobj
5 0 obj
<< /IsEmpty false >>
endobj";
-
+
var bytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
var locations = BruteForceSearcher.GetObjectLocations(bytes);
@@ -168,12 +168,35 @@ endobj";
Assert.Equal(581, locations[new IndirectReference(7, 0)]);
Assert.Equal(5068, locations[new IndirectReference(8, 0)]);
Assert.Equal(5091, locations[new IndirectReference(9, 0)]);
-
+
var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]);
Assert.StartsWith("3 0 obj", s);
}
}
+        [Fact]
+        public void BruteForceSearcherBytesFileOffsetsCorrect()
+        {
+            // Load the whole document into memory so the searcher works on a byte array.
+            var documentPath = IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf");
+            var input = new ByteArrayInputBytes(File.ReadAllBytes(documentPath));
+
+            var objectOffsets = BruteForceSearcher.GetObjectLocations(input);
+
+            Assert.Equal(13, objectOffsets.Count);
+
+            // Expected offsets for objects 1 through 9 (generation 0 for each);
+            // the entry at index i belongs to object number i + 1.
+            var expected = new long[] { 6183, 244, 15, 222, 5766, 353, 581, 5068, 5091 };
+            for (var i = 0; i < expected.Length; i++)
+            {
+                Assert.Equal(expected[i], objectOffsets[new IndirectReference(i + 1, 0)]);
+            }
+
+            var text = GetStringAt(input, objectOffsets[new IndirectReference(3, 0)]);
+            Assert.StartsWith("3 0 obj", text);
+        }
+
[Fact]
public void BruteForceSearcherFileOffsetsCorrectOpenOffice()
{
diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
index ab53777f..ee5abca5 100644
--- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
+++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs
@@ -42,6 +42,8 @@
var currentlyInObject = false;
+ var objBuffer = new byte[4];
+
do
{
if (loopProtection > 1_000_000)
@@ -94,17 +96,24 @@
bytes.Seek(currentOffset);
- if (!ReadHelper.IsString(bytes, " obj"))
+ bytes.Read(objBuffer);
+
+ if (!IsStartObjMarker(objBuffer))
{
currentOffset++;
continue;
}
// Current byte is ' '[obj]
- var offset = currentOffset - 1;
+ var offset = currentOffset + 1;
bytes.Seek(offset);
+ while (ReadHelper.IsWhitespace(bytes.CurrentByte) && offset >= MinimumSearchOffset)
+ {
+ bytes.Seek(--offset);
+ }
+
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{
generationBytes.Insert(0, (char)bytes.CurrentByte);
@@ -113,13 +122,16 @@
}
// We should now be at the space between object and generation number.
- if (!ReadHelper.IsSpace(bytes.CurrentByte))
+ if (!ReadHelper.IsWhitespace(bytes.CurrentByte))
{
currentOffset++;
continue;
}
- bytes.Seek(--offset);
+ while (ReadHelper.IsWhitespace(bytes.CurrentByte))
+ {
+ bytes.Seek(--offset);
+ }
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{
@@ -185,5 +197,17 @@
bytes.Seek(originalOffset);
return long.MaxValue;
}
+
+        private static bool IsStartObjMarker(byte[] data)
+        {
+            // A start marker is one whitespace byte followed by the letters "obj",
+            // where each letter may be either upper or lower case.
+            var leadsWithWhitespace = ReadHelper.IsWhitespace(data[0]);
+
+            return leadsWithWhitespace
+                && (data[1] == 'o' || data[1] == 'O')
+                && (data[2] == 'b' || data[2] == 'B')
+                && (data[3] == 'j' || data[3] == 'J');
+        }
}
}