fix #176, allow startxref to appear earlier in the document

This commit is contained in:
Eliot Jones
2020-05-31 17:01:38 +01:00
parent 4312aa470e
commit bf45602ac5
2 changed files with 51 additions and 29 deletions

View File

@@ -90,5 +90,21 @@
Assert.False(document.TryGetBookmarks(out _)); Assert.False(document.TryGetBookmarks(out _));
} }
} }
[Fact]
public void StartXRefNotNearEnd()
{
    // Padding appended after the original file content: a single line feed (10)
    // followed by zero bytes (the array default), 2026 bytes in total. This moves
    // everything in the original file, including its end, 2026 bytes further away
    // from the end of the data handed to the parser.
    var padding = new byte[2026];
    padding[0] = 10;

    var paddedDocument = File.ReadAllBytes(GetFilename()).Concat(padding).ToArray();

    // Open with lenient parsing switched off and check the page count is still read.
    using (var pdf = PdfDocument.Open(paddedDocument, ParsingOptions.LenientParsingOff))
    {
        var pageCount = pdf.NumberOfPages;

        Assert.Equal(1, pageCount);
    }
}
} }
} }

View File

@@ -38,7 +38,7 @@
(byte) 'e', (byte) 'e',
(byte) 'f' (byte) 'f'
}; };
public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing) public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
{ {
if (bytes == null) if (bytes == null)
@@ -55,10 +55,6 @@
var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange; var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;
var startPosition = fileLength - offsetFromEnd;
bytes.Seek(startPosition);
var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd); var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);
scanner.Seek(startXrefPosition); scanner.Seek(startXrefPosition);
@@ -96,38 +92,48 @@
var startXrefs = new List<int>(); var startXrefs = new List<int>();
var index = 0; var index = 0;
var offset = 0;
var fileLength = bytes.Length;
// Starting scanning the last 1024 bytes. var multiple = 1;
while (bytes.MoveNext())
var actualStartOffset = Math.Max(0, fileLength - (offsetFromEnd * multiple));
do
{ {
offset++; multiple *= 2;
if (bytes.CurrentByte == StartXRefBytes[index]) bytes.Seek(actualStartOffset);
// Starting scanning the file bytes.
while (bytes.MoveNext())
{ {
// We might be reading "startxref". if (bytes.CurrentByte == StartXRefBytes[index])
index++; {
} // We might be reading "startxref".
else index++;
{ }
index = 0; else
{
index = 0;
}
if (index == StartXRefBytes.Length)
{
// Add this "startxref" (position from the start of the document to the first 's').
startXrefs.Add((int)bytes.CurrentOffset - StartXRefBytes.Length);
// Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
index = 0;
}
} }
if (index == StartXRefBytes.Length) actualStartOffset = Math.Max(0, fileLength - (offsetFromEnd * multiple));
{ } while (startXrefs.Count == 0 && actualStartOffset > 0);
// Add this "startxref" (position from the end of the document to the first 's').
startXrefs.Add(offsetFromEnd - (offset - StartXRefBytes.Length));
// Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
index = 0;
}
}
if (startXrefs.Count == 0) if (startXrefs.Count == 0)
{ {
throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters."); throw new PdfDocumentFormatException($"Could not find the startxref within the last {offsetFromEnd} characters.");
} }
return bytes.Length - startXrefs[startXrefs.Count - 1]; return startXrefs[startXrefs.Count - 1];
} }
} }
} }