diff --git a/src/UglyToad.PdfPig.Core/ReadHelper.cs b/src/UglyToad.PdfPig.Core/ReadHelper.cs index 6859602b..fe205bfd 100644 --- a/src/UglyToad.PdfPig.Core/ReadHelper.cs +++ b/src/UglyToad.PdfPig.Core/ReadHelper.cs @@ -123,7 +123,7 @@ } /// - /// Determines if a character is whitespace or not. + /// Determines whether a character is whitespace; this includes newlines. /// /// /// These values are specified in table 1 (page 12) of ISO 32000-1:2008. diff --git a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs index b8e995cc..195e744d 100644 --- a/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs +++ b/src/UglyToad.PdfPig.Tests/Parser/Parts/BruteForceSearcherTests.cs @@ -36,7 +36,7 @@ startxref 216 %%EOF"; - private static readonly long[] TestDataOffsets = + private static readonly long[] TestDataOffsets = { TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase), TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase), @@ -57,7 +57,7 @@ startxref public void SearcherFindsCorrectObjects() { var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData)); - + var locations = BruteForceSearcher.GetObjectLocations(input); Assert.Equal(4, locations.Count); @@ -69,11 +69,11 @@ startxref public void ReaderOnlyCallsOnce() { var reader = StringBytesTestConverter.Convert(TestData, false); - + var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes); Assert.Equal(4, locations.Count); - + var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes); Assert.Equal(4, locations.Count); @@ -132,7 +132,7 @@ endobj 5 0 obj << /IsEmpty false >> endobj"; - + var bytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s)); var locations = BruteForceSearcher.GetObjectLocations(bytes); @@ -168,12 +168,35 @@ endobj"; Assert.Equal(581, locations[new IndirectReference(7, 0)]); Assert.Equal(5068, locations[new 
IndirectReference(8, 0)]); Assert.Equal(5091, locations[new IndirectReference(9, 0)]); - + var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]); Assert.StartsWith("3 0 obj", s); } } + [Fact] + public void BruteForceSearcherBytesFileOffsetsCorrect() + { + var bytes = new ByteArrayInputBytes(File.ReadAllBytes(IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf"))); + + var locations = BruteForceSearcher.GetObjectLocations(bytes); + + Assert.Equal(13, locations.Count); + + Assert.Equal(6183, locations[new IndirectReference(1, 0)]); + Assert.Equal(244, locations[new IndirectReference(2, 0)]); + Assert.Equal(15, locations[new IndirectReference(3, 0)]); + Assert.Equal(222, locations[new IndirectReference(4, 0)]); + Assert.Equal(5766, locations[new IndirectReference(5, 0)]); + Assert.Equal(353, locations[new IndirectReference(6, 0)]); + Assert.Equal(581, locations[new IndirectReference(7, 0)]); + Assert.Equal(5068, locations[new IndirectReference(8, 0)]); + Assert.Equal(5091, locations[new IndirectReference(9, 0)]); + + var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]); + Assert.StartsWith("3 0 obj", s); + } + [Fact] public void BruteForceSearcherFileOffsetsCorrectOpenOffice() { diff --git a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs index ab53777f..ee5abca5 100644 --- a/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs +++ b/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs @@ -42,6 +42,8 @@ var currentlyInObject = false; + var objBuffer = new byte[4]; + do { if (loopProtection > 1_000_000) @@ -94,17 +96,24 @@ bytes.Seek(currentOffset); - if (!ReadHelper.IsString(bytes, " obj")) + bytes.Read(objBuffer); + // NOTE(review): the result of Read is ignored; if fewer than 4 bytes can be read, objBuffer may keep bytes from an earlier iteration. Confirm a stale match is impossible. + if (!IsStartObjMarker(objBuffer)) { currentOffset++; continue; } // Current byte is ' '[obj] - var offset = currentOffset - 1; + var offset = currentOffset + 1; bytes.Seek(offset); + while (ReadHelper.IsWhitespace(bytes.CurrentByte) && 
offset >= MinimumSearchOffset) + { + bytes.Seek(--offset); + } + while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset) { generationBytes.Insert(0, (char)bytes.CurrentByte); @@ -113,13 +122,17 @@ } // We should now be at the space between object and generation number. - if (!ReadHelper.IsSpace(bytes.CurrentByte)) + if (!ReadHelper.IsWhitespace(bytes.CurrentByte)) { currentOffset++; continue; } - bytes.Seek(--offset); + // Skip the whole whitespace run, but never seek back past MinimumSearchOffset. + while (ReadHelper.IsWhitespace(bytes.CurrentByte) && offset >= MinimumSearchOffset) + { + bytes.Seek(--offset); + } while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset) { @@ -185,5 +198,20 @@ bytes.Seek(originalOffset); return long.MaxValue; } + + /// <summary> + /// Checks whether the buffer holds a whitespace byte followed by "obj" in any letter case. + /// </summary> + private static bool IsStartObjMarker(byte[] data) + { + if (!ReadHelper.IsWhitespace(data[0])) + { + return false; + } + + return (data[1] == 'o' || data[1] == 'O') + && (data[2] == 'b' || data[2] == 'B') + && (data[3] == 'j' || data[3] == 'J'); + } } }