fix issue with newlines in object start tokens #88

When we brute-force search a file that contains newlines between object tokens, the parser now handles them correctly, preventing pseudo-infinite loops.
This commit is contained in:
Eliot Jones
2020-03-17 20:09:47 +00:00
parent 5094f9d9d0
commit 0f91017613
3 changed files with 58 additions and 11 deletions

View File

@@ -123,7 +123,7 @@
}
/// <summary>
/// Determines if a character is whitespace or not.
/// Determines if a character is whitespace or not, this includes newlines.
/// </summary>
/// <remarks>
/// These values are specified in table 1 (page 12) of ISO 32000-1:2008.

View File

@@ -36,7 +36,7 @@ startxref
216
%%EOF";
private static readonly long[] TestDataOffsets =
private static readonly long[] TestDataOffsets =
{
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
@@ -57,7 +57,7 @@ startxref
public void SearcherFindsCorrectObjects()
{
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
var locations = BruteForceSearcher.GetObjectLocations(input);
Assert.Equal(4, locations.Count);
@@ -69,11 +69,11 @@ startxref
public void ReaderOnlyCallsOnce()
{
var reader = StringBytesTestConverter.Convert(TestData, false);
var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
Assert.Equal(4, locations.Count);
var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
Assert.Equal(4, locations.Count);
@@ -132,7 +132,7 @@ endobj
5 0 obj
<< /IsEmpty false >>
endobj";
var bytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
var locations = BruteForceSearcher.GetObjectLocations(bytes);
@@ -168,12 +168,35 @@ endobj";
Assert.Equal(581, locations[new IndirectReference(7, 0)]);
Assert.Equal(5068, locations[new IndirectReference(8, 0)]);
Assert.Equal(5091, locations[new IndirectReference(9, 0)]);
var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]);
Assert.StartsWith("3 0 obj", s);
}
}
[Fact]
public void BruteForceSearcherBytesFileOffsetsCorrect()
{
    // Load the whole document into memory and wrap it as a byte array input.
    var documentPath = IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf");
    var input = new ByteArrayInputBytes(File.ReadAllBytes(documentPath));

    var found = BruteForceSearcher.GetObjectLocations(input);

    Assert.Equal(13, found.Count);

    // Expected offset of generation 0 for each listed object number.
    var expectedOffsets = new (int ObjectNumber, long Offset)[]
    {
        (1, 6183),
        (2, 244),
        (3, 15),
        (4, 222),
        (5, 5766),
        (6, 353),
        (7, 581),
        (8, 5068),
        (9, 5091)
    };

    foreach (var expected in expectedOffsets)
    {
        Assert.Equal(expected.Offset, found[new IndirectReference(expected.ObjectNumber, 0)]);
    }

    // The text found at the recorded offset must begin with the object's own header.
    var objectThreeText = GetStringAt(input, found[new IndirectReference(3, 0)]);
    Assert.StartsWith("3 0 obj", objectThreeText);
}
[Fact]
public void BruteForceSearcherFileOffsetsCorrectOpenOffice()
{

View File

@@ -42,6 +42,8 @@
var currentlyInObject = false;
var objBuffer = new byte[4];
do
{
if (loopProtection > 1_000_000)
@@ -94,17 +96,24 @@
bytes.Seek(currentOffset);
if (!ReadHelper.IsString(bytes, " obj"))
bytes.Read(objBuffer);
if (!IsStartObjMarker(objBuffer))
{
currentOffset++;
continue;
}
// Current byte is ' '[obj]
var offset = currentOffset - 1;
var offset = currentOffset + 1;
bytes.Seek(offset);
while (ReadHelper.IsWhitespace(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{
bytes.Seek(--offset);
}
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{
generationBytes.Insert(0, (char)bytes.CurrentByte);
@@ -113,13 +122,16 @@
}
// We should now be at the space between object and generation number.
if (!ReadHelper.IsSpace(bytes.CurrentByte))
if (!ReadHelper.IsWhitespace(bytes.CurrentByte))
{
currentOffset++;
continue;
}
bytes.Seek(--offset);
while (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
bytes.Seek(--offset);
}
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{
@@ -185,5 +197,17 @@
bytes.Seek(originalOffset);
return long.MaxValue;
}
/// <summary>
/// Checks whether the buffer holds a whitespace byte followed by the letters "obj" in any casing.
/// </summary>
/// <param name="data">The buffer to inspect; indices 0 to 3 are read.</param>
/// <returns><see langword="true"/> if the first four bytes form a whitespace-prefixed "obj" marker.</returns>
private static bool IsStartObjMarker(byte[] data)
{
    // Compares a single buffer position against the lower and upper case form of a letter.
    bool IsLetterAt(int index, char lower, char upper) => data[index] == lower || data[index] == upper;

    // Short-circuit evaluation keeps the checks in buffer order: whitespace first, then 'o', 'b', 'j'.
    return ReadHelper.IsWhitespace(data[0])
        && IsLetterAt(1, 'o', 'O')
        && IsLetterAt(2, 'b', 'B')
        && IsLetterAt(3, 'j', 'J');
}
}
}