fix issue with newlines in object start tokens #88

When we brute-force search the file and it contains newlines between the object start tokens, the parsing now handles them correctly, which prevents pseudo-infinite loops.
This commit is contained in:
Eliot Jones
2020-03-17 20:09:47 +00:00
parent 5094f9d9d0
commit 0f91017613
3 changed files with 58 additions and 11 deletions

View File

@@ -123,7 +123,7 @@
} }
/// <summary> /// <summary>
/// Determines if a character is whitespace or not. /// Determines if a character is whitespace or not, this includes newlines.
/// </summary> /// </summary>
/// <remarks> /// <remarks>
/// These values are specified in table 1 (page 12) of ISO 32000-1:2008. /// These values are specified in table 1 (page 12) of ISO 32000-1:2008.

View File

@@ -36,7 +36,7 @@ startxref
216 216
%%EOF"; %%EOF";
private static readonly long[] TestDataOffsets = private static readonly long[] TestDataOffsets =
{ {
TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase), TestData.IndexOf("2 17 obj", StringComparison.OrdinalIgnoreCase),
TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase), TestData.IndexOf("3 0 obj", StringComparison.OrdinalIgnoreCase),
@@ -57,7 +57,7 @@ startxref
public void SearcherFindsCorrectObjects() public void SearcherFindsCorrectObjects()
{ {
var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData)); var input = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(TestData));
var locations = BruteForceSearcher.GetObjectLocations(input); var locations = BruteForceSearcher.GetObjectLocations(input);
Assert.Equal(4, locations.Count); Assert.Equal(4, locations.Count);
@@ -69,11 +69,11 @@ startxref
public void ReaderOnlyCallsOnce() public void ReaderOnlyCallsOnce()
{ {
var reader = StringBytesTestConverter.Convert(TestData, false); var reader = StringBytesTestConverter.Convert(TestData, false);
var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes); var locations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
Assert.Equal(4, locations.Count); Assert.Equal(4, locations.Count);
var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes); var newLocations = BruteForceSearcher.GetObjectLocations(reader.Bytes);
Assert.Equal(4, locations.Count); Assert.Equal(4, locations.Count);
@@ -132,7 +132,7 @@ endobj
5 0 obj 5 0 obj
<< /IsEmpty false >> << /IsEmpty false >>
endobj"; endobj";
var bytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s)); var bytes = new ByteArrayInputBytes(OtherEncodings.StringAsLatin1Bytes(s));
var locations = BruteForceSearcher.GetObjectLocations(bytes); var locations = BruteForceSearcher.GetObjectLocations(bytes);
@@ -168,12 +168,35 @@ endobj";
Assert.Equal(581, locations[new IndirectReference(7, 0)]); Assert.Equal(581, locations[new IndirectReference(7, 0)]);
Assert.Equal(5068, locations[new IndirectReference(8, 0)]); Assert.Equal(5068, locations[new IndirectReference(8, 0)]);
Assert.Equal(5091, locations[new IndirectReference(9, 0)]); Assert.Equal(5091, locations[new IndirectReference(9, 0)]);
var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]); var s = GetStringAt(bytes, locations[new IndirectReference(3, 0)]);
Assert.StartsWith("3 0 obj", s); Assert.StartsWith("3 0 obj", s);
} }
} }
/// <summary>
/// Runs the brute-force searcher over the raw bytes of the Inkscape sample document and
/// checks the number of objects found plus the byte offsets reported for objects 1 to 9.
/// </summary>
[Fact]
public void BruteForceSearcherBytesFileOffsetsCorrect()
{
    var path = IntegrationHelpers.GetDocumentPath("Single Page Simple - from inkscape.pdf");
    var bytes = new ByteArrayInputBytes(File.ReadAllBytes(path));

    var locations = BruteForceSearcher.GetObjectLocations(bytes);

    Assert.Equal(13, locations.Count);

    // Expected offsets for object numbers 1..9 (generation 0); index i holds object i + 1.
    long[] expectedOffsets = { 6183, 244, 15, 222, 5766, 353, 581, 5068, 5091 };
    for (var i = 0; i < expectedOffsets.Length; i++)
    {
        Assert.Equal(expectedOffsets[i], locations[new IndirectReference(i + 1, 0)]);
    }

    // The recorded offset for object 3 must point at the start of its "3 0 obj" header.
    var objectThreeText = GetStringAt(bytes, locations[new IndirectReference(3, 0)]);
    Assert.StartsWith("3 0 obj", objectThreeText);
}
[Fact] [Fact]
public void BruteForceSearcherFileOffsetsCorrectOpenOffice() public void BruteForceSearcherFileOffsetsCorrectOpenOffice()
{ {

View File

@@ -42,6 +42,8 @@
var currentlyInObject = false; var currentlyInObject = false;
var objBuffer = new byte[4];
do do
{ {
if (loopProtection > 1_000_000) if (loopProtection > 1_000_000)
@@ -94,17 +96,24 @@
bytes.Seek(currentOffset); bytes.Seek(currentOffset);
if (!ReadHelper.IsString(bytes, " obj")) bytes.Read(objBuffer);
if (!IsStartObjMarker(objBuffer))
{ {
currentOffset++; currentOffset++;
continue; continue;
} }
// Current byte is ' '[obj] // Current byte is ' '[obj]
var offset = currentOffset - 1; var offset = currentOffset + 1;
bytes.Seek(offset); bytes.Seek(offset);
while (ReadHelper.IsWhitespace(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{
bytes.Seek(--offset);
}
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset) while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{ {
generationBytes.Insert(0, (char)bytes.CurrentByte); generationBytes.Insert(0, (char)bytes.CurrentByte);
@@ -113,13 +122,16 @@
} }
// We should now be at the space between object and generation number. // We should now be at the space between object and generation number.
if (!ReadHelper.IsSpace(bytes.CurrentByte)) if (!ReadHelper.IsWhitespace(bytes.CurrentByte))
{ {
currentOffset++; currentOffset++;
continue; continue;
} }
bytes.Seek(--offset); while (ReadHelper.IsWhitespace(bytes.CurrentByte))
{
bytes.Seek(--offset);
}
while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset) while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
{ {
@@ -185,5 +197,17 @@
bytes.Seek(originalOffset); bytes.Seek(originalOffset);
return long.MaxValue; return long.MaxValue;
} }
/// <summary>
/// Checks whether the first four bytes of <paramref name="data"/> are a whitespace byte
/// followed by the letters 'o', 'b', 'j' in either upper or lower case.
/// </summary>
/// <param name="data">The bytes to inspect; indices 0 to 3 are read as needed.</param>
/// <returns><c>true</c> if the bytes match the marker, otherwise <c>false</c>.</returns>
private static bool IsStartObjMarker(byte[] data)
{
    // Bytes are examined strictly left to right and the method returns at the first mismatch.
    if (!ReadHelper.IsWhitespace(data[0]))
    {
        return false;
    }

    if (data[1] != 'o' && data[1] != 'O')
    {
        return false;
    }

    if (data[2] != 'b' && data[2] != 'B')
    {
        return false;
    }

    return data[3] == 'j' || data[3] == 'J';
}
} }
} }