recover from invalid cross reference position

If the cross-reference offset points at a number, we previously assumed it was the start of a stream object. If it is not, we now brute-force the entire file looking for an 'xref' token. This should be combined with a search for cross-reference streams, and should also run when we read neither a numeric token nor an 'xref' token, but for now this fixes the observed issue.

Also adds the number of images to the page API so that consumers do not need to enumerate the images just to count them.
This commit is contained in:
Eliot Jones
2020-01-28 18:07:05 +00:00
parent 29061b1fd2
commit 8ab2838063
4 changed files with 107 additions and 6 deletions

View File

@@ -164,6 +164,7 @@
/// <summary>
/// Whether the given string is at this position in the input.
/// Resets to the current offset once read.
/// </summary>
public static bool IsString(IInputBytes bytes, string s)
{

View File

@@ -69,6 +69,11 @@
/// </summary>
public PageSize Size { get; }
/// <summary>
/// The number of images on this page, as reported by the page's <c>Content</c>.
/// Use <see cref="GetImages"/> to access the image contents.
/// </summary>
public int NumberOfImages => Content.NumberOfImages;
/// <summary>
/// The parsed graphics state operations in the content stream for this page.
/// </summary>

View File

@@ -30,6 +30,8 @@
public IReadOnlyList<PdfPath> Paths { get; }
/// <summary>
/// The number of entries in the images collection held by this page content.
/// </summary>
public int NumberOfImages => images.Count;
internal PageContent(IReadOnlyList<IGraphicsStateOperation> graphicsStateOperations, IReadOnlyList<Letter> letters,
IReadOnlyList<PdfPath> paths,
IReadOnlyList<Union<XObjectContentRecord, InlineImage>> images,

View File

@@ -112,7 +112,7 @@
{
try
{
streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner);
TryParseCrossReferenceStream(streamOffset, pdfScanner, out streamPart);
}
catch (InvalidOperationException ex)
{
@@ -156,7 +156,18 @@
tokenScanner.Seek(previousCrossReferenceLocation);
// parse xref stream
var tablePart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, out var tablePart))
{
if (!TryBruteForceXrefTableLocate(bytes, previousCrossReferenceLocation, out var actualOffset))
{
throw new PdfDocumentFormatException();
}
previousCrossReferenceLocation = actualOffset;
missedAttempts++;
continue;
}
table.Add(tablePart);
previousCrossReferenceLocation = tablePart.Previous;
@@ -211,8 +222,11 @@
return resolved;
}
private CrossReferenceTablePart ParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner)
private bool TryParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner,
out CrossReferenceTablePart xrefTablePart)
{
xrefTablePart = null;
pdfScanner.Seek(objByteOffset);
pdfScanner.MoveNext();
@@ -221,12 +235,91 @@
if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
{
throw new PdfDocumentFormatException($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");
log.Error($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");
return false;
}
CrossReferenceTablePart xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);
xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);
return xrefTablePart;
return true;
}
/// <summary>
/// Searches the input for an 'xref' keyword which is immediately preceded by whitespace.
/// The input is scanned forwards starting just before <paramref name="expectedOffset"/> and,
/// if nothing is found, backwards from that position in small overlapping windows.
/// </summary>
/// <param name="bytes">The input to search. Its position is moved by this method.</param>
/// <param name="expectedOffset">The offset at which the cross reference was expected to be.</param>
/// <param name="actualOffset">
/// When the method returns <see langword="true"/>, the offset of the 'x' of the located keyword;
/// otherwise <paramref name="expectedOffset"/>.
/// </param>
/// <returns><see langword="true"/> if an 'xref' keyword was located, otherwise <see langword="false"/>.</returns>
private bool TryBruteForceXrefTableLocate(IInputBytes bytes, long expectedOffset,
    out long actualOffset)
{
    actualOffset = expectedOffset;

    // Start one byte early so the byte before the expected offset can act as the
    // "previous" byte, but never seek to a negative position.
    var forwardStart = expectedOffset > 0 ? expectedOffset - 1 : 0;
    bytes.Seek(forwardStart);

    var currentByte = bytes.CurrentByte;

    // Forward:
    while (bytes.MoveNext())
    {
        var previousByte = currentByte;
        currentByte = bytes.CurrentByte;

        if (currentByte != 'x' || !ReadHelper.IsWhitespace(previousByte))
        {
            continue;
        }

        if (!ReadHelper.IsString(bytes, "xref"))
        {
            continue;
        }

        actualOffset = bytes.CurrentOffset;
        return true;
    }

    // Backward:
    var lastOffset = expectedOffset - 1;

    if (lastOffset < 0)
    {
        return false;
    }

    var buffer = new byte[5];

    // Consecutive windows share one byte so that a whitespace/'x' pair which straddles
    // two windows is still compared.
    var step = buffer.Length - 1;

    while (true)
    {
        bytes.Seek(lastOffset);

        // A short read (for example near the end of the input) only limits how much of this
        // window is inspected; it does not stop the backward search.
        var read = bytes.Read(buffer);

        for (var i = 1; i < read; i++)
        {
            if (buffer[i] != 'x' || !ReadHelper.IsWhitespace(buffer[i - 1]))
            {
                continue;
            }

            // Offset arithmetic kept from the original implementation.
            // NOTE(review): assumes Seek(n) makes the byte read into buffer[i] the current byte - confirm.
            var xLocation = lastOffset + i + 1;

            bytes.Seek(xLocation);

            if (ReadHelper.IsString(bytes, "xref"))
            {
                actualOffset = xLocation;
                return true;
            }
        }

        if (lastOffset == 0)
        {
            // The window starting at the beginning of the input has been inspected.
            return false;
        }

        // Step back, clamping to the start so the first bytes of the input are not skipped.
        lastOffset = lastOffset > step ? lastOffset - step : 0;
    }
}
}
}