mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-15 19:54:52 +08:00
fix bugs with reading cross reference tables.
This commit is contained in:
@@ -0,0 +1,39 @@
|
||||
namespace UglyToad.Pdf.Tests.Integration
{
    using System;
    using System.IO;
    using Xunit;

    /// <summary>
    /// Integration tests which open the document
    /// "Font Size Test - from google chrome print pdf.pdf" and check its page information.
    /// </summary>
    public class FontSizeTestFromGoogleChromeTests
    {
        /// <summary>
        /// Builds the full path to the test document. The documents folder is resolved by walking
        /// three directories up from the application base directory and then into Integration/Documents.
        /// </summary>
        private static string GetFilename()
        {
            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");

            // Normalize away the ".." segments before appending the file name.
            return Path.Combine(Path.GetFullPath(relativeFolder), "Font Size Test - from google chrome print pdf.pdf");
        }

        /// <summary>
        /// The document is expected to contain exactly one page.
        /// </summary>
        [Fact]
        public void GetsCorrectNumberOfPages()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                Assert.Equal(1, document.NumberOfPages);
            }
        }

        /// <summary>
        /// The first page is expected to report a width of 595 and a height of 842.
        /// </summary>
        [Fact]
        public void GetsCorrectPageWidthAndHeight()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                var firstPage = document.GetPage(1);

                Assert.Equal(595, firstPage.Width);
                Assert.Equal(842, firstPage.Height);
            }
        }
    }
}
|
@@ -10,8 +10,8 @@
|
||||
<ItemGroup>
|
||||
<None Remove="Fonts\TrueType\google-simple-doc.ttf" />
|
||||
<None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
|
||||
<None Remove="Integration\Documents\Font Size Test - from google chrome print pdf.pdf" />
|
||||
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
|
||||
<None Remove="Integration\Documents\Font Size Text - from google chrome print pdf.pdf" />
|
||||
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
|
||||
</ItemGroup>
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
<Content Include="Integration\Documents\Font Size Test - from libre office.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
<Content Include="Integration\Documents\Font Size Text - from google chrome print pdf.pdf">
|
||||
<Content Include="Integration\Documents\Font Size Test - from google chrome print pdf.pdf">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">
|
||||
|
@@ -71,7 +71,12 @@
|
||||
|
||||
if (offsetOrStreamNumber == null)
|
||||
{
|
||||
return CosNull.Null;
|
||||
if (isLenientParsing)
|
||||
{
|
||||
return CosNull.Null;
|
||||
}
|
||||
|
||||
throw new InvalidOperationException($"Could not locate the object {key.Number} which was not found in the cross reference table.");
|
||||
}
|
||||
|
||||
var isCompressedStreamObject = offsetOrStreamNumber <= 0;
|
||||
|
@@ -10,6 +10,9 @@
|
||||
|
||||
internal class CrossReferenceTableParser
|
||||
{
|
||||
private const string InUseEntry = "n";
|
||||
private const string FreeEntry = "f";
|
||||
|
||||
private readonly ILog log;
|
||||
private readonly CosDictionaryParser dictionaryParser;
|
||||
private readonly CosBaseParser baseParser;
|
||||
@@ -25,7 +28,8 @@
|
||||
{
|
||||
builder = null;
|
||||
|
||||
long xrefTableStartOffset = source.GetPosition();
|
||||
var tableStartOffset = source.GetPosition();
|
||||
|
||||
if (source.Peek() != 'x')
|
||||
{
|
||||
return false;
|
||||
@@ -40,6 +44,7 @@
|
||||
// check for trailer after xref
|
||||
var str = ReadHelper.ReadString(source);
|
||||
byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
|
||||
|
||||
source.Rewind(b.Length);
|
||||
|
||||
if (str.StartsWith("trailer"))
|
||||
@@ -54,23 +59,25 @@
|
||||
XRefType = CrossReferenceType.Table
|
||||
};
|
||||
|
||||
// Xref tables can have multiple sections. Each starts with a starting object id and a count.
|
||||
// Tables can have multiple sections. Each starts with a starting object id and a count.
|
||||
while (true)
|
||||
{
|
||||
var currentLine = ReadHelper.ReadLine(source);
|
||||
String[] splitString = currentLine.Split(new[] { "\\s" }, StringSplitOptions.RemoveEmptyEntries);
|
||||
if (splitString.Length != 2)
|
||||
if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
|
||||
{
|
||||
log.Warn("Unexpected XRefTable Entry: " + currentLine);
|
||||
break;
|
||||
if (isLenientParsing)
|
||||
{
|
||||
log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException($"Unexpected subsection definition in the cross-reference table at offset {offset}");
|
||||
}
|
||||
}
|
||||
// first obj id
|
||||
long currObjID = long.Parse(splitString[0]);
|
||||
// the number of objects in the xref table
|
||||
int count = int.Parse(splitString[1]);
|
||||
|
||||
var currentObjectId = subsectionDefinition.FirstNumber;
|
||||
|
||||
ReadHelper.SkipSpaces(source);
|
||||
for (int i = 0; i < count; i++)
|
||||
for (var i = 0; i < subsectionDefinition.Count; i++)
|
||||
{
|
||||
if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
|
||||
{
|
||||
@@ -81,42 +88,47 @@
|
||||
break;
|
||||
}
|
||||
//Ignore table contents
|
||||
currentLine = ReadHelper.ReadLine(source);
|
||||
splitString = currentLine.Split(new[] { "\\s" }, StringSplitOptions.RemoveEmptyEntries);
|
||||
var currentLine = ReadHelper.ReadLine(source);
|
||||
var splitString = currentLine.Split(new[] {' '}, StringSplitOptions.RemoveEmptyEntries);
|
||||
if (splitString.Length < 3)
|
||||
{
|
||||
log.Warn("invalid xref line: " + currentLine);
|
||||
break;
|
||||
}
|
||||
|
||||
/* This supports the corrupt table as reported in
|
||||
* PDFBOX-474 (XXXX XXX XX n) */
|
||||
if (splitString[splitString.Length - 1].Equals("n"))
|
||||
// This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
|
||||
if (splitString[splitString.Length - 1].Equals(InUseEntry))
|
||||
{
|
||||
try
|
||||
{
|
||||
long currOffset = long.Parse(splitString[0]);
|
||||
if (currOffset >= xrefTableStartOffset && currOffset <= source.GetPosition())
|
||||
var objectOffset = long.Parse(splitString[0]);
|
||||
|
||||
if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
|
||||
{
|
||||
// PDFBOX-3923: offset points inside this table - that can't be good
|
||||
throw new InvalidOperationException("XRefTable offset " + currOffset +
|
||||
" is within xref table for " + currObjID);
|
||||
throw new InvalidOperationException(
|
||||
$"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
|
||||
}
|
||||
int currGenID = int.Parse(splitString[1]);
|
||||
builder.Add(currObjID, currGenID, currOffset);
|
||||
|
||||
var generation = int.Parse(splitString[1]);
|
||||
builder.Add(currentObjectId, generation, objectOffset);
|
||||
}
|
||||
catch (FormatException e)
|
||||
{
|
||||
throw new InvalidOperationException("Bad", e);
|
||||
}
|
||||
}
|
||||
else if (!splitString[2].Equals("f"))
|
||||
else if (!splitString[2].Equals(FreeEntry))
|
||||
{
|
||||
throw new InvalidOperationException("Corrupt XRefTable Entry - ObjID:" + currObjID);
|
||||
throw new InvalidOperationException(
|
||||
$"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
|
||||
}
|
||||
currObjID++;
|
||||
|
||||
currentObjectId++;
|
||||
|
||||
ReadHelper.SkipSpaces(source);
|
||||
}
|
||||
|
||||
ReadHelper.SkipSpaces(source);
|
||||
if (!ReadHelper.IsDigit(source))
|
||||
{
|
||||
|
@@ -45,15 +45,14 @@
|
||||
|
||||
var table = new CrossReferenceTableBuilder();
|
||||
|
||||
long prev = xrefLocation;
|
||||
long previousCrossReferenceLocation = xrefLocation;
|
||||
// ---- parse whole chain of xref tables/object streams using PREV reference
|
||||
HashSet<long> prevSet = new HashSet<long>();
|
||||
while (prev > 0)
|
||||
while (previousCrossReferenceLocation > 0)
|
||||
{
|
||||
// seek to xref table
|
||||
reader.Seek(prev);
|
||||
|
||||
// skip white spaces
|
||||
reader.Seek(previousCrossReferenceLocation);
|
||||
|
||||
ReadHelper.SkipSpaces(reader);
|
||||
|
||||
var isTable = reader.Peek() == X;
|
||||
@@ -63,7 +62,7 @@
|
||||
{
|
||||
// xref table and trailer
|
||||
// use existing parser to parse xref table
|
||||
if (!crossReferenceTableParser.TryParse(reader, prev, isLenientParsing, pool, out var tableBuilder))
|
||||
if (!crossReferenceTableParser.TryParse(reader, previousCrossReferenceLocation, isLenientParsing, pool, out var tableBuilder))
|
||||
{
|
||||
throw new InvalidOperationException($"Expected trailer object at position: {reader.GetPosition()}");
|
||||
}
|
||||
@@ -89,7 +88,7 @@
|
||||
ReadHelper.SkipSpaces(reader);
|
||||
try
|
||||
{
|
||||
streamPart = ParseCrossReferenceStream(reader, prev, pool, isLenientParsing);
|
||||
streamPart = ParseCrossReferenceStream(reader, previousCrossReferenceLocation, pool, isLenientParsing);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
@@ -115,46 +114,50 @@
|
||||
}
|
||||
}
|
||||
}
|
||||
prev = trailer.GetLongOrDefault(CosName.PREV);
|
||||
if (prev > 0)
|
||||
previousCrossReferenceLocation = trailer.GetLongOrDefault(CosName.PREV);
|
||||
if (previousCrossReferenceLocation > 0)
|
||||
{
|
||||
// check the xref table reference
|
||||
fixedOffset = xrefOffsetValidator.CheckXRefOffset(prev, isLenientParsing);
|
||||
if (fixedOffset > -1 && fixedOffset != prev)
|
||||
fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
|
||||
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
|
||||
{
|
||||
prev = fixedOffset;
|
||||
trailer.SetLong(CosName.PREV, prev);
|
||||
previousCrossReferenceLocation = fixedOffset;
|
||||
trailer.SetLong(CosName.PREV, previousCrossReferenceLocation);
|
||||
}
|
||||
}
|
||||
|
||||
tableBuilder.Previous = tableBuilder.Dictionary.GetLongOrDefault(CosName.PREV);
|
||||
|
||||
table.Add(tableBuilder.AsCrossReferenceTablePart());
|
||||
table.Add(streamPart);
|
||||
|
||||
if (streamPart != null)
|
||||
{
|
||||
table.Add(streamPart);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// parse xref stream
|
||||
var tablePart = ParseCrossReferenceStream(reader, prev, pool, isLenientParsing);
|
||||
var tablePart = ParseCrossReferenceStream(reader, previousCrossReferenceLocation, pool, isLenientParsing);
|
||||
table.Add(tablePart);
|
||||
|
||||
prev = tablePart.Previous;
|
||||
if (prev > 0)
|
||||
previousCrossReferenceLocation = tablePart.Previous;
|
||||
if (previousCrossReferenceLocation > 0)
|
||||
{
|
||||
// check the xref table reference
|
||||
fixedOffset = xrefOffsetValidator.CheckXRefOffset(prev, isLenientParsing);
|
||||
if (fixedOffset > -1 && fixedOffset != prev)
|
||||
fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
|
||||
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
|
||||
{
|
||||
prev = fixedOffset;
|
||||
tablePart.FixOffset(prev);
|
||||
previousCrossReferenceLocation = fixedOffset;
|
||||
tablePart.FixOffset(previousCrossReferenceLocation);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (prevSet.Contains(prev))
|
||||
if (prevSet.Contains(previousCrossReferenceLocation))
|
||||
{
|
||||
throw new InvalidOperationException("/Prev loop at offset " + prev);
|
||||
throw new InvalidOperationException("/Prev loop at offset " + previousCrossReferenceLocation);
|
||||
}
|
||||
prevSet.Add(prev);
|
||||
prevSet.Add(previousCrossReferenceLocation);
|
||||
}
|
||||
|
||||
var resolved = table.Build(xrefLocation, log);
|
||||
|
@@ -0,0 +1,82 @@
|
||||
namespace UglyToad.Pdf.Parser.Parts.CrossReference
{
    using System;
    using System.Globalization;
    using IO;
    using Logging;

    /// <summary>
    /// Each subsection of the cross-reference table starts with a line defining the starting object number
    /// and the count of objects in the subsection.
    /// </summary>
    /// <example>
    /// xref
    /// 12 16
    /// ...
    ///
    /// Defines a table subsection that starts with object 12 and has 16 entries (12-27).
    /// </example>
    internal struct TableSubsectionDefinition
    {
        // The two numbers on the definition line are split on the space character only.
        private static readonly char[] Splitters = { ' ' };

        /// <summary>
        /// The first object number in the table.
        /// </summary>
        public long FirstNumber { get; }

        /// <summary>
        /// The number of consecutive objects declared in the table.
        /// </summary>
        public int Count { get; }

        /// <summary>
        /// Create a new <see cref="TableSubsectionDefinition"/> to define a range of consecutive objects in the cross-reference table.
        /// </summary>
        /// <param name="firstNumber">The first object number in the subsection.</param>
        /// <param name="count">The number of consecutive objects in the subsection.</param>
        public TableSubsectionDefinition(long firstNumber, int count)
        {
            FirstNumber = firstNumber;
            Count = count;
        }

        /// <summary>
        /// Attempts to read the <see cref="TableSubsectionDefinition"/> from the current line of the source.
        /// </summary>
        /// <param name="log">Receives an error entry when the two parts of the line cannot be parsed as numbers.</param>
        /// <param name="source">The source from which a single line is read using <c>ReadHelper.ReadLine</c>.</param>
        /// <param name="definition">
        /// The parsed definition when this method returns <see langword="true"/>, otherwise the default value.
        /// </param>
        /// <returns>
        /// <see langword="true"/> if the line consisted of exactly two space-separated, non-negative integers;
        /// otherwise <see langword="false"/>.
        /// </returns>
        public static bool TryRead(ILog log, IRandomAccessRead source, out TableSubsectionDefinition definition)
        {
            definition = default(TableSubsectionDefinition);

            var line = ReadHelper.ReadLine(source);

            // NOTE(review): the end-of-input behaviour of ReadLine is not visible here; a missing line is
            // reported as a failed read rather than surfacing as a NullReferenceException from a Try* method.
            if (line == null)
            {
                return false;
            }

            var parts = line.Split(Splitters, StringSplitOptions.RemoveEmptyEntries);

            if (parts.Length != 2)
            {
                return false;
            }

            try
            {
                // The file content is machine-readable, so parsing must not depend on the current thread culture.
                var firstObjectId = long.Parse(parts[0], NumberStyles.Integer, CultureInfo.InvariantCulture);
                var objectCount = int.Parse(parts[1], NumberStyles.Integer, CultureInfo.InvariantCulture);

                // Object numbers and entry counts cannot be negative; treat such a line as not being a
                // subsection definition and let the caller decide how to react.
                if (firstObjectId < 0 || objectCount < 0)
                {
                    return false;
                }

                definition = new TableSubsectionDefinition(firstObjectId, objectCount);

                return true;
            }
            catch (Exception ex) when (ex is FormatException || ex is OverflowException)
            {
                // Only the failures the numeric parse can produce are handled; anything else propagates.
                log.Error(
                    $"The format for the subsection definition was invalid, expected [long] [int], instead got '{line}'", ex);

                return false;
            }
        }

        /// <summary>
        /// Returns the definition as the first object number followed by the count, separated by a space.
        /// </summary>
        public override string ToString()
        {
            return $"{FirstNumber} {Count}";
        }
    }
}
|
Reference in New Issue
Block a user