mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-08-20 06:57:47 +08:00
Try to repair xref offset by looking for all startxref and fix #1040
This commit is contained in:
parent
bf7c3c01d0
commit
6911f31b49
@ -189,7 +189,34 @@
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// <summary>
/// Whether the given string is at this position in the input.
/// Resets to the current offset once read.
/// </summary>
/// <param name="bytes">The input to inspect. It is seeked back to the offset it had on entry before returning.</param>
/// <param name="s">The byte sequence to look for, compared starting at the current byte.</param>
/// <returns>
/// <see langword="true"/> if every byte of <paramref name="s"/> matches the input in order
/// (an empty sequence always matches); otherwise <see langword="false"/>.
/// </returns>
public static bool IsString(IInputBytes bytes, ReadOnlySpan<byte> s)
{
    var startOffset = bytes.CurrentOffset;
    var found = true;

    for (var i = 0; i < s.Length; i++)
    {
        if (bytes.CurrentByte != s[i])
        {
            found = false;
            break;
        }

        // Only advance while there are still bytes of the pattern left to compare.
        // If the input cannot advance at that point it ended part-way through the pattern,
        // so the pattern cannot be present. Ignoring the result of MoveNext() would compare
        // the remaining pattern bytes against a position that never moved.
        if (i < s.Length - 1 && !bytes.MoveNext())
        {
            found = false;
            break;
        }
    }

    // Always rewind so callers can continue reading from where they were.
    bytes.Seek(startOffset);

    return found;
}
|
||||
|
||||
/// <summary>
|
||||
/// Read a long from the input.
|
||||
/// </summary>
|
||||
@ -252,14 +279,6 @@
|
||||
throw new PdfDocumentFormatException($"Error: Expected an integer type at offset {bytes.CurrentOffset}, instead got \'{OtherEncodings.BytesAsLatin1String(intBytes)}\'");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Determines whether the supplied character code is the space character <c>' '</c>.
/// </summary>
/// <param name="c">The character code to test.</param>
/// <returns><see langword="true"/> only when <paramref name="c"/> equals <c>' '</c>.</returns>
public static bool IsSpace(int c) => c == ' ';
|
||||
|
||||
/// <summary>
|
||||
/// Whether the given character value is a valid hex value.
|
||||
|
@ -7,6 +7,21 @@
|
||||
|
||||
public class GithubIssuesTests
|
||||
{
|
||||
[Fact]
public void Issue1040()
{
    // Regression check for issue #1040: with lenient parsing enabled,
    // pages 1 and 2 of the sample document must both expose letters.
    var path = IntegrationHelpers.GetSpecificTestDocumentPath("pdfpig-issue-1040.pdf");
    var options = new ParsingOptions() { UseLenientParsing = true };

    using (var document = PdfDocument.Open(path, options))
    {
        foreach (var pageNumber in new[] { 1, 2 })
        {
            var page = document.GetPage(pageNumber);
            Assert.NotEmpty(page.Letters);
        }
    }
}
|
||||
|
||||
[Fact]
|
||||
public void Issue1013()
|
||||
{
|
||||
|
Binary file not shown.
@ -1,7 +1,6 @@
|
||||
namespace UglyToad.PdfPig.Parser.FileStructure
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Core;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
@ -26,7 +25,7 @@
|
||||
/// </summary>
|
||||
private const int EndOfFileSearchRange = 2048;
|
||||
|
||||
private static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
|
||||
internal static ReadOnlySpan<byte> StartXRefBytes => "startxref"u8;
|
||||
|
||||
public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
|
||||
{
|
||||
|
@ -7,12 +7,16 @@
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
|
||||
internal class XrefOffsetValidator
|
||||
internal sealed class XrefOffsetValidator
|
||||
{
|
||||
private static readonly long MinimumSearchOffset = 6;
|
||||
private const long MinimumSearchOffset = 6;
|
||||
|
||||
private static ReadOnlySpan<byte> XRefBytes => "xref"u8;
|
||||
private static ReadOnlySpan<byte> SpaceObjBytes => " obj"u8;
|
||||
|
||||
private readonly ILog log;
|
||||
|
||||
private List<long>? bfSearchStartXRefTablesOffsets;
|
||||
private List<long>? bfSearchXRefTablesOffsets;
|
||||
private List<long>? bfSearchXRefStreamsOffsets;
|
||||
|
||||
@ -90,16 +94,18 @@
|
||||
|
||||
BfSearchForXRefStreams(reader);
|
||||
|
||||
if (bfSearchXRefTablesOffsets != null)
|
||||
if (bfSearchXRefTablesOffsets != null && bfSearchXRefTablesOffsets.Count > 0)
|
||||
{
|
||||
// TODO to be optimized, this won't work in every case
|
||||
newOffsetTable = SearchNearestValue(bfSearchXRefTablesOffsets, xrefOffset);
|
||||
}
|
||||
if (bfSearchXRefStreamsOffsets != null)
|
||||
|
||||
if (bfSearchXRefStreamsOffsets != null && bfSearchXRefStreamsOffsets.Count > 0)
|
||||
{
|
||||
// TODO to be optimized, this won't work in every case
|
||||
newOffsetStream = SearchNearestValue(bfSearchXRefStreamsOffsets, xrefOffset);
|
||||
}
|
||||
|
||||
// choose the nearest value
|
||||
if (newOffsetTable > -1 && newOffsetStream > -1)
|
||||
{
|
||||
@ -126,9 +132,91 @@
|
||||
newOffset = newOffsetStream;
|
||||
bfSearchXRefStreamsOffsets!.Remove(newOffsetStream);
|
||||
}
|
||||
else
|
||||
{
|
||||
log.Warn("Trying to repair xref offset by looking for all startxref.");
|
||||
if (TryBruteForceSearchForXrefFromStartxref(xrefOffset, scanner, reader, out long newOffsetFromStartxref))
|
||||
{
|
||||
newOffset = newOffsetFromStartxref;
|
||||
}
|
||||
}
|
||||
|
||||
return newOffset;
|
||||
}
|
||||
|
||||
/// <summary>
/// Attempts to repair the xref offset from the <c>startxref</c> keywords found in the input:
/// picks the <c>startxref</c> nearest to <paramref name="xrefOffset"/>, reads the number that follows it
/// and accepts that number when it points at an <c>xref</c> keyword or passes <c>CheckXRefStreamOffset</c>.
/// </summary>
/// <param name="xrefOffset">The offset being repaired; used to select the nearest <c>startxref</c>.</param>
/// <param name="scanner">Token scanner used to read the candidate offset. Its position is restored before returning.</param>
/// <param name="reader">The raw input that is searched for <c>startxref</c> keywords.</param>
/// <param name="newOffset">The repaired offset, or <c>-1</c> when no usable offset was found.</param>
/// <returns><see langword="true"/> if <paramref name="newOffset"/> was set to a usable offset; otherwise <see langword="false"/>.</returns>
private bool TryBruteForceSearchForXrefFromStartxref(long xrefOffset, ISeekableTokenScanner scanner, IInputBytes reader, out long newOffset)
{
    newOffset = -1;

    BruteForceSearchForStartxref(reader);

    // Same guard as used for the table and stream offset lists: nothing to choose from
    // when no startxref keyword was found.
    var startXRefOffsets = bfSearchStartXRefTablesOffsets;
    if (startXRefOffsets is null || startXRefOffsets.Count == 0)
    {
        return false;
    }

    long newStartXRefOffset = SearchNearestValue(startXRefOffsets, xrefOffset);

    // A negative value is not a position in the input; it must not be treated as a startxref location.
    if (newStartXRefOffset < 0 || newStartXRefOffset >= reader.Length)
    {
        return false;
    }

    var startOffset = scanner.CurrentPosition;

    // Skip over the keyword itself so the next token is the offset that follows it.
    scanner.Seek(newStartXRefOffset + FileTrailerParser.StartXRefBytes.Length);

    if (scanner.MoveNext()
        && scanner.CurrentToken is NumericToken token
        && token.Long > -1
        && token.Long < reader.Length) // an offset at or past the end of the input cannot hold an xref
    {
        long tempNewOffset = token.Long;

        scanner.Seek(tempNewOffset);

        // Only trust CurrentToken when the scanner actually produced a new token.
        var pointsAtXrefTable = scanner.MoveNext()
            && ReferenceEquals(scanner.CurrentToken, OperatorToken.Xref);

        if (pointsAtXrefTable || CheckXRefStreamOffset(tempNewOffset, scanner, true))
        {
            newOffset = tempNewOffset;
        }
    }

    scanner.Seek(startOffset);

    return newOffset != -1;
}
|
||||
|
||||
/// <summary>
/// Scans the input once for <c>startxref</c> keywords and caches their offsets in
/// <c>bfSearchStartXRefTablesOffsets</c>. Later calls return immediately.
/// The reader is seeked back to its original offset when the scan completes.
/// </summary>
/// <param name="bytes">The input to scan.</param>
private void BruteForceSearchForStartxref(IInputBytes bytes)
{
    if (bfSearchStartXRefTablesOffsets != null)
    {
        // The scan has already been performed; keep the cached result.
        return;
    }

    // A pdf may contain more than one startxref entry, so collect all of them.
    var foundOffsets = new List<long>();
    bfSearchStartXRefTablesOffsets = foundOffsets;

    var originalOffset = bytes.CurrentOffset;

    bytes.Seek(MinimumSearchOffset);

    while (bytes.MoveNext() && !bytes.IsAtEnd())
    {
        if (!ReadHelper.IsString(bytes, FileTrailerParser.StartXRefBytes))
        {
            continue;
        }

        var keywordOffset = bytes.CurrentOffset;

        // Step back one position and only record the keyword when the byte reported there is whitespace.
        bytes.Seek(keywordOffset - 1);

        if (ReadHelper.IsWhitespace(bytes.CurrentByte))
        {
            foundOffsets.Add(keywordOffset);
        }

        // Continue scanning 9 positions on, i.e. the length of "startxref".
        bytes.Seek(keywordOffset + 9);
    }

    bytes.Seek(originalOffset);
}
|
||||
|
||||
private void BruteForceSearchForTables(IInputBytes bytes)
|
||||
{
|
||||
if (bfSearchXRefTablesOffsets != null)
|
||||
@ -146,7 +234,7 @@
|
||||
// search for xref tables
|
||||
while (bytes.MoveNext() && !bytes.IsAtEnd())
|
||||
{
|
||||
if (ReadHelper.IsString(bytes, "xref"))
|
||||
if (ReadHelper.IsString(bytes, XRefBytes))
|
||||
{
|
||||
var newOffset = bytes.CurrentOffset;
|
||||
|
||||
@ -180,11 +268,9 @@
|
||||
bytes.Seek(MinimumSearchOffset);
|
||||
|
||||
// search for XRef streams
|
||||
var objString = " obj";
|
||||
|
||||
while (bytes.MoveNext() && !bytes.IsAtEnd())
|
||||
{
|
||||
if (!ReadHelper.IsString(bytes, "xref"))
|
||||
if (!ReadHelper.IsString(bytes, XRefBytes))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@ -209,7 +295,7 @@
|
||||
|
||||
for (int j = 0; j < 10; j++)
|
||||
{
|
||||
if (ReadHelper.IsString(bytes, objString))
|
||||
if (ReadHelper.IsString(bytes, SpaceObjBytes))
|
||||
{
|
||||
long tempOffset = currentOffset - 1;
|
||||
|
||||
@ -224,7 +310,7 @@
|
||||
bytes.Seek(tempOffset);
|
||||
|
||||
// is the digit preceded by a space?
|
||||
if (ReadHelper.IsSpace(bytes.CurrentByte))
|
||||
if (ReadHelper.IsWhitespace(bytes.CurrentByte))
|
||||
{
|
||||
int length = 0;
|
||||
bytes.Seek(--tempOffset);
|
||||
|
Loading…
Reference in New Issue
Block a user