mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-14 10:55:04 +08:00
recover from invalid cross reference position
If we are reading a cross reference offset which contains a number, we assumed it was a stream object. If it's not, we now brute-force the entire file looking for an 'xref' token. This should be combined with a search for cross-reference streams and should run when we read neither the numeric token nor an 'xref' token, but for now this fixes the observed issue. Also adds the number of images to the page API to prevent consumers needing to enumerate.
This commit is contained in:
@@ -164,6 +164,7 @@
|
||||
|
||||
/// <summary>
|
||||
/// Whether the given string is at this position in the input.
|
||||
/// Resets to the current offset once read.
|
||||
/// </summary>
|
||||
public static bool IsString(IInputBytes bytes, string s)
|
||||
{
|
||||
|
@@ -69,6 +69,11 @@
|
||||
/// </summary>
|
||||
public PageSize Size { get; }
|
||||
|
||||
/// <summary>
/// Gets how many images this page contains, as reported by the page content.
/// Call <see cref="GetImages"/> to read the images themselves.
/// </summary>
public int NumberOfImages
{
    get { return Content.NumberOfImages; }
}
|
||||
|
||||
/// <summary>
|
||||
/// The parsed graphics state operations in the content stream for this page.
|
||||
/// </summary>
|
||||
|
@@ -30,6 +30,8 @@
|
||||
|
||||
public IReadOnlyList<PdfPath> Paths { get; }
|
||||
|
||||
/// <summary>
/// Gets the number of entries in the images collection held by this content.
/// </summary>
public int NumberOfImages
{
    get { return images.Count; }
}
|
||||
|
||||
internal PageContent(IReadOnlyList<IGraphicsStateOperation> graphicsStateOperations, IReadOnlyList<Letter> letters,
|
||||
IReadOnlyList<PdfPath> paths,
|
||||
IReadOnlyList<Union<XObjectContentRecord, InlineImage>> images,
|
||||
|
@@ -112,7 +112,7 @@
|
||||
{
|
||||
try
|
||||
{
|
||||
streamPart = ParseCrossReferenceStream(streamOffset, pdfScanner);
|
||||
TryParseCrossReferenceStream(streamOffset, pdfScanner, out streamPart);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
@@ -156,7 +156,18 @@
|
||||
tokenScanner.Seek(previousCrossReferenceLocation);
|
||||
|
||||
// parse xref stream
|
||||
var tablePart = ParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner);
|
||||
if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, out var tablePart))
|
||||
{
|
||||
if (!TryBruteForceXrefTableLocate(bytes, previousCrossReferenceLocation, out var actualOffset))
|
||||
{
|
||||
throw new PdfDocumentFormatException();
|
||||
}
|
||||
|
||||
previousCrossReferenceLocation = actualOffset;
|
||||
missedAttempts++;
|
||||
continue;
|
||||
}
|
||||
|
||||
table.Add(tablePart);
|
||||
|
||||
previousCrossReferenceLocation = tablePart.Previous;
|
||||
@@ -211,8 +222,11 @@
|
||||
return resolved;
|
||||
}
|
||||
|
||||
private CrossReferenceTablePart ParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner)
|
||||
private bool TryParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner,
|
||||
out CrossReferenceTablePart xrefTablePart)
|
||||
{
|
||||
xrefTablePart = null;
|
||||
|
||||
pdfScanner.Seek(objByteOffset);
|
||||
|
||||
pdfScanner.MoveNext();
|
||||
@@ -221,12 +235,91 @@
|
||||
|
||||
if (streamObjectToken == null || !(streamObjectToken.Data is StreamToken objectStream))
|
||||
{
|
||||
throw new PdfDocumentFormatException($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");
|
||||
log.Error($"When reading a cross reference stream object found a non-stream object: {streamObjectToken?.Data}");
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
CrossReferenceTablePart xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);
|
||||
xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);
|
||||
|
||||
return xrefTablePart;
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
/// Searches the raw input for the keyword <c>xref</c> preceded by a whitespace byte.
/// The input is first scanned forward from just before <paramref name="expectedOffset"/>; if nothing is
/// found, the bytes preceding that position are scanned in small windows walking back to the start.
/// </summary>
/// <param name="bytes">The input to search. Its position is moved by this method.</param>
/// <param name="expectedOffset">The offset at which the cross reference was expected to be.</param>
/// <param name="actualOffset">
/// The offset of the located keyword when the method returns <see langword="true"/>;
/// otherwise <paramref name="expectedOffset"/>.
/// </param>
/// <returns><see langword="true"/> if an <c>xref</c> keyword was located, otherwise <see langword="false"/>.</returns>
private bool TryBruteForceXrefTableLocate(IInputBytes bytes, long expectedOffset,
    out long actualOffset)
{
    const string marker = "xref";

    actualOffset = expectedOffset;

    // Never seek before the start of the input, even when the recorded offset is zero or negative.
    var forwardStart = expectedOffset > 0 ? expectedOffset - 1 : 0;

    bytes.Seek(forwardStart);
    var currentByte = bytes.CurrentByte;

    // Forward: walk byte by byte looking for whitespace followed by 'x', then confirm the full keyword.
    while (bytes.MoveNext())
    {
        var previousByte = currentByte;
        currentByte = bytes.CurrentByte;

        if (currentByte != 'x' || !ReadHelper.IsWhitespace(previousByte))
        {
            continue;
        }

        // IsString restores the input position after its check, so the scan can simply carry on.
        if (!ReadHelper.IsString(bytes, marker))
        {
            continue;
        }

        actualOffset = bytes.CurrentOffset;
        return true;
    }

    // Backward: nothing precedes the expected position, so there is nothing left to scan.
    var windowStart = expectedOffset - 1;

    if (windowStart < 0)
    {
        return false;
    }

    var buffer = new byte[5];

    // Consecutive windows share one byte so that a whitespace/'x' pair straddling a window
    // boundary is still examined.
    var step = buffer.Length - 1;

    while (true)
    {
        bytes.Seek(windowStart);

        // NOTE(review): assumes Read fills the buffer starting at the position set by Seek and returns
        // the number of bytes read, which the offset arithmetic below already relied on - confirm.
        var read = bytes.Read(buffer);

        // Only inspect the bytes actually read; a short read near the end of the input must not
        // abandon the backward scan.
        for (var i = 1; i < read; i++)
        {
            if (buffer[i] != 'x' || !ReadHelper.IsWhitespace(buffer[i - 1]))
            {
                continue;
            }

            var xLocation = windowStart + i + 1;

            bytes.Seek(xLocation);

            if (ReadHelper.IsString(bytes, marker))
            {
                actualOffset = xLocation;
                return true;
            }
        }

        if (windowStart == 0)
        {
            break;
        }

        // Clamp the last window to the start of the input so the head of the file is scanned too.
        windowStart = windowStart > step ? windowStart - step : 0;
    }

    return false;
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user