fix bugs with reading cross reference tables.

This commit is contained in:
Eliot Jones
2017-12-30 12:56:46 +00:00
parent f869bba72c
commit 6adc0c169d
7 changed files with 194 additions and 53 deletions

View File

@@ -0,0 +1,39 @@
namespace UglyToad.Pdf.Tests.Integration
{
    using System;
    using System.IO;
    using Xunit;

    /// <summary>
    /// Integration tests that open the "Font Size Test - from google chrome print pdf.pdf" document
    /// and check the page count and the dimensions of its first page.
    /// </summary>
    public class FontSizeTestFromGoogleChromeTests
    {
        /// <summary>
        /// Builds the full path to the test document, resolved relative to the application base directory.
        /// </summary>
        private static string GetFilename()
        {
            // Three levels above the base directory, then down into Integration/Documents.
            var relativeFolder = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents");
            var absoluteFolder = Path.GetFullPath(relativeFolder);

            return Path.Combine(absoluteFolder, "Font Size Test - from google chrome print pdf.pdf");
        }

        /// <summary>
        /// The document is expected to report exactly one page.
        /// </summary>
        [Fact]
        public void GetsCorrectNumberOfPages()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                Assert.Equal(1, document.NumberOfPages);
            }
        }

        /// <summary>
        /// The first page is expected to be 595 wide and 842 high.
        /// </summary>
        [Fact]
        public void GetsCorrectPageWidthAndHeight()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                var firstPage = document.GetPage(1);

                Assert.Equal(595, firstPage.Width);
                Assert.Equal(842, firstPage.Height);
            }
        }
    }
}

View File

@@ -10,8 +10,8 @@
<ItemGroup>
<None Remove="Fonts\TrueType\google-simple-doc.ttf" />
<None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
<None Remove="Integration\Documents\Font Size Test - from google chrome print pdf.pdf" />
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
<None Remove="Integration\Documents\Font Size Text - from google chrome print pdf.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
</ItemGroup>
@@ -25,7 +25,7 @@
<Content Include="Integration\Documents\Font Size Test - from libre office.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Font Size Text - from google chrome print pdf.pdf">
<Content Include="Integration\Documents\Font Size Test - from google chrome print pdf.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">

View File

@@ -71,7 +71,12 @@
if (offsetOrStreamNumber == null)
{
return CosNull.Null;
if (isLenientParsing)
{
return CosNull.Null;
}
throw new InvalidOperationException($"Could not locate the object {key.Number} which was not found in the cross reference table.");
}
var isCompressedStreamObject = offsetOrStreamNumber <= 0;

View File

@@ -10,6 +10,9 @@
internal class CrossReferenceTableParser
{
private const string InUseEntry = "n";
private const string FreeEntry = "f";
private readonly ILog log;
private readonly CosDictionaryParser dictionaryParser;
private readonly CosBaseParser baseParser;
@@ -25,7 +28,8 @@
{
builder = null;
long xrefTableStartOffset = source.GetPosition();
var tableStartOffset = source.GetPosition();
if (source.Peek() != 'x')
{
return false;
@@ -40,6 +44,7 @@
// check for trailer after xref
var str = ReadHelper.ReadString(source);
byte[] b = OtherEncodings.StringAsLatin1Bytes(str);
source.Rewind(b.Length);
if (str.StartsWith("trailer"))
@@ -54,23 +59,25 @@
XRefType = CrossReferenceType.Table
};
// Xref tables can have multiple sections. Each starts with a starting object id and a count.
// Tables can have multiple sections. Each starts with a starting object id and a count.
while (true)
{
var currentLine = ReadHelper.ReadLine(source);
String[] splitString = currentLine.Split(new[] { "\\s" }, StringSplitOptions.RemoveEmptyEntries);
if (splitString.Length != 2)
if (!TableSubsectionDefinition.TryRead(log, source, out var subsectionDefinition))
{
log.Warn("Unexpected XRefTable Entry: " + currentLine);
break;
if (isLenientParsing)
{
log.Warn($"Unexpected subsection definition in the cross-reference table at offset {offset}");
}
else
{
throw new InvalidOperationException($"Unexpected subsection definition in the cross-reference table at offset {offset}");
}
}
// first obj id
long currObjID = long.Parse(splitString[0]);
// the number of objects in the xref table
int count = int.Parse(splitString[1]);
var currentObjectId = subsectionDefinition.FirstNumber;
ReadHelper.SkipSpaces(source);
for (int i = 0; i < count; i++)
for (var i = 0; i < subsectionDefinition.Count; i++)
{
if (source.IsEof() || ReadHelper.IsEndOfName((char)source.Peek()))
{
@@ -81,42 +88,47 @@
break;
}
//Ignore table contents
currentLine = ReadHelper.ReadLine(source);
splitString = currentLine.Split(new[] { "\\s" }, StringSplitOptions.RemoveEmptyEntries);
var currentLine = ReadHelper.ReadLine(source);
var splitString = currentLine.Split(new[] {' '}, StringSplitOptions.RemoveEmptyEntries);
if (splitString.Length < 3)
{
log.Warn("invalid xref line: " + currentLine);
break;
}
/* This supports the corrupt table as reported in
* PDFBOX-474 (XXXX XXX XX n) */
if (splitString[splitString.Length - 1].Equals("n"))
// This supports the corrupt table as reported in PDFBOX-474 (XXXX XXX XX n)
if (splitString[splitString.Length - 1].Equals(InUseEntry))
{
try
{
long currOffset = long.Parse(splitString[0]);
if (currOffset >= xrefTableStartOffset && currOffset <= source.GetPosition())
var objectOffset = long.Parse(splitString[0]);
if (objectOffset >= tableStartOffset && objectOffset <= source.GetPosition())
{
// PDFBOX-3923: offset points inside this table - that can't be good
throw new InvalidOperationException("XRefTable offset " + currOffset +
" is within xref table for " + currObjID);
throw new InvalidOperationException(
$"Object offset {objectOffset} is within its own cross-reference table for object {currentObjectId}");
}
int currGenID = int.Parse(splitString[1]);
builder.Add(currObjID, currGenID, currOffset);
var generation = int.Parse(splitString[1]);
builder.Add(currentObjectId, generation, objectOffset);
}
catch (FormatException e)
{
throw new InvalidOperationException("Bad", e);
}
}
else if (!splitString[2].Equals("f"))
else if (!splitString[2].Equals(FreeEntry))
{
throw new InvalidOperationException("Corrupt XRefTable Entry - ObjID:" + currObjID);
throw new InvalidOperationException(
$"Corrupt cross-reference table entry for object {currentObjectId}. The indicator was not 'n' or 'f' but {splitString[2]}.");
}
currObjID++;
currentObjectId++;
ReadHelper.SkipSpaces(source);
}
ReadHelper.SkipSpaces(source);
if (!ReadHelper.IsDigit(source))
{

View File

@@ -45,15 +45,14 @@
var table = new CrossReferenceTableBuilder();
long prev = xrefLocation;
long previousCrossReferenceLocation = xrefLocation;
// ---- parse whole chain of xref tables/object streams using PREV reference
HashSet<long> prevSet = new HashSet<long>();
while (prev > 0)
while (previousCrossReferenceLocation > 0)
{
// seek to xref table
reader.Seek(prev);
// skip white spaces
reader.Seek(previousCrossReferenceLocation);
ReadHelper.SkipSpaces(reader);
var isTable = reader.Peek() == X;
@@ -63,7 +62,7 @@
{
// xref table and trailer
// use existing parser to parse xref table
if (!crossReferenceTableParser.TryParse(reader, prev, isLenientParsing, pool, out var tableBuilder))
if (!crossReferenceTableParser.TryParse(reader, previousCrossReferenceLocation, isLenientParsing, pool, out var tableBuilder))
{
throw new InvalidOperationException($"Expected trailer object at position: {reader.GetPosition()}");
}
@@ -89,7 +88,7 @@
ReadHelper.SkipSpaces(reader);
try
{
streamPart = ParseCrossReferenceStream(reader, prev, pool, isLenientParsing);
streamPart = ParseCrossReferenceStream(reader, previousCrossReferenceLocation, pool, isLenientParsing);
}
catch (InvalidOperationException ex)
{
@@ -115,46 +114,50 @@
}
}
}
prev = trailer.GetLongOrDefault(CosName.PREV);
if (prev > 0)
previousCrossReferenceLocation = trailer.GetLongOrDefault(CosName.PREV);
if (previousCrossReferenceLocation > 0)
{
// check the xref table reference
fixedOffset = xrefOffsetValidator.CheckXRefOffset(prev, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != prev)
fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
{
prev = fixedOffset;
trailer.SetLong(CosName.PREV, prev);
previousCrossReferenceLocation = fixedOffset;
trailer.SetLong(CosName.PREV, previousCrossReferenceLocation);
}
}
tableBuilder.Previous = tableBuilder.Dictionary.GetLongOrDefault(CosName.PREV);
table.Add(tableBuilder.AsCrossReferenceTablePart());
table.Add(streamPart);
if (streamPart != null)
{
table.Add(streamPart);
}
}
else
{
// parse xref stream
var tablePart = ParseCrossReferenceStream(reader, prev, pool, isLenientParsing);
var tablePart = ParseCrossReferenceStream(reader, previousCrossReferenceLocation, pool, isLenientParsing);
table.Add(tablePart);
prev = tablePart.Previous;
if (prev > 0)
previousCrossReferenceLocation = tablePart.Previous;
if (previousCrossReferenceLocation > 0)
{
// check the xref table reference
fixedOffset = xrefOffsetValidator.CheckXRefOffset(prev, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != prev)
fixedOffset = xrefOffsetValidator.CheckXRefOffset(previousCrossReferenceLocation, isLenientParsing);
if (fixedOffset > -1 && fixedOffset != previousCrossReferenceLocation)
{
prev = fixedOffset;
tablePart.FixOffset(prev);
previousCrossReferenceLocation = fixedOffset;
tablePart.FixOffset(previousCrossReferenceLocation);
}
}
}
if (prevSet.Contains(prev))
if (prevSet.Contains(previousCrossReferenceLocation))
{
throw new InvalidOperationException("/Prev loop at offset " + prev);
throw new InvalidOperationException("/Prev loop at offset " + previousCrossReferenceLocation);
}
prevSet.Add(prev);
prevSet.Add(previousCrossReferenceLocation);
}
var resolved = table.Build(xrefLocation, log);

View File

@@ -0,0 +1,82 @@
namespace UglyToad.Pdf.Parser.Parts.CrossReference
{
    using System;
    using System.Globalization;
    using IO;
    using Logging;

    /// <summary>
    /// Each subsection of the cross-reference table starts with a line defining the starting object number
    /// and the count of objects in the subsection.
    /// </summary>
    /// <example>
    /// xref
    /// 12 16
    /// ...
    ///
    /// Defines a table subsection that starts with object 12 and has 16 entries (12-27).
    /// </example>
    internal struct TableSubsectionDefinition
    {
        // Split on every PDF white-space character rather than only the space character so that
        // tab separated values, or a stray carriage return left on the line, do not cause an
        // otherwise valid subsection definition to be rejected.
        private static readonly char[] Splitters = { ' ', '\t', '\r', '\n', '\f', '\0' };

        /// <summary>
        /// The first object number in the table.
        /// </summary>
        public long FirstNumber { get; }

        /// <summary>
        /// The number of consecutive objects declared in the table.
        /// </summary>
        public int Count { get; }

        /// <summary>
        /// Create a new <see cref="TableSubsectionDefinition"/> to define a range of consecutive objects in the cross-reference table.
        /// </summary>
        /// <param name="firstNumber">The object number of the first entry in the subsection.</param>
        /// <param name="count">The number of entries in the subsection.</param>
        public TableSubsectionDefinition(long firstNumber, int count)
        {
            FirstNumber = firstNumber;
            Count = count;
        }

        /// <summary>
        /// Attempts to read the <see cref="TableSubsectionDefinition"/> from the current line of the source.
        /// </summary>
        /// <param name="log">Receives an error entry when the line has two parts which are not valid unsigned integers.</param>
        /// <param name="source">The source to read the next line from. The line is consumed whether or not parsing succeeds.</param>
        /// <param name="definition">The parsed definition on success, otherwise the default value.</param>
        /// <returns>
        /// <see langword="true"/> if the line contained exactly two unsigned integers which fit in a
        /// <see cref="long"/> and an <see cref="int"/> respectively, otherwise <see langword="false"/>.
        /// </returns>
        public static bool TryRead(ILog log, IRandomAccessRead source, out TableSubsectionDefinition definition)
        {
            definition = default(TableSubsectionDefinition);

            var line = ReadHelper.ReadLine(source);

            // NOTE(review): it is not visible here whether ReadLine can return null; the guard is harmless if it cannot.
            if (line == null)
            {
                return false;
            }

            var parts = line.Split(Splitters, StringSplitOptions.RemoveEmptyEntries);

            if (parts.Length != 2)
            {
                return false;
            }

            try
            {
                // NumberStyles.None only accepts decimal digits, so signed (negative) values are rejected,
                // and the invariant culture keeps parsing independent of the machine's regional settings.
                var firstObjectId = long.Parse(parts[0], NumberStyles.None, CultureInfo.InvariantCulture);
                var objectCount = int.Parse(parts[1], NumberStyles.None, CultureInfo.InvariantCulture);

                definition = new TableSubsectionDefinition(firstObjectId, objectCount);

                return true;
            }
            catch (Exception ex) when (ex is FormatException || ex is OverflowException)
            {
                log.Error(
                    $"The format for the subsection definition was invalid, expected [long] [int], instead got '{line}'", ex);

                return false;
            }
        }

        /// <summary>
        /// Returns the definition in the same "first count" form used in the cross-reference table.
        /// </summary>
        public override string ToString()
        {
            return $"{FirstNumber} {Count}";
        }
    }
}