#434 ensure companion stream is added to cross reference on building

This commit is contained in:
Eliot Jones
2022-04-02 15:58:22 -04:00
parent abcbdc55e3
commit 2fd46571b3
6 changed files with 82 additions and 51 deletions

View File

@@ -1,39 +1,39 @@
namespace UglyToad.PdfPig.Tests.Integration namespace UglyToad.PdfPig.Tests.Integration
{ {
//using System; using System;
//using System.Diagnostics; using System.Diagnostics;
//using System.IO; using System.IO;
//using Xunit; using Xunit;
/// <summary> /// <summary>
/// A class for testing files which are not checked in to source control. /// A class for testing files which are not checked in to source control.
/// </summary> /// </summary>
public class LocalTests public class LocalTests
{ {
//[Fact] [Fact]
//public void Tests() public void Tests()
//{ {
// var files = Directory.GetFiles(@"C:\temp\pdfs", "*.pdf"); var files = Directory.GetFiles(@"C:\temp\pdfs", "*.pdf");
// foreach (var file in files) foreach (var file in files)
// { {
// try try
// { {
// using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false })) using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false }))
// { {
// for (var i = 1; i <= document.NumberOfPages; i++) for (var i = 1; i <= document.NumberOfPages; i++)
// { {
// var page = document.GetPage(i); var page = document.GetPage(i);
// var text = page.Text; var text = page.Text;
// Trace.WriteLine(text); Trace.WriteLine(text);
// } }
// } }
// } }
// catch (Exception ex) catch (Exception ex)
// { {
// throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex); throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex);
// } }
// } }
//} }
} }
} }

View File

@@ -34,28 +34,35 @@
DictionaryToken trailerDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>()); DictionaryToken trailerDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>());
Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>(); Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();
List<long> xrefSeqBytePos = new List<long>(); var xrefPartToBytePositionOrder = new List<long>();
var currentPart = parts.FirstOrDefault(x => x.Offset == firstCrossReferenceOffset); var currentPart = parts.FirstOrDefault(x => x.Offset == firstCrossReferenceOffset);
if (currentPart == null) if (currentPart == null)
{ {
// no XRef at given position // no XRef at given position
log.Warn("Did not found XRef object at specified startxref position " + firstCrossReferenceOffset); log.Warn($"Did not find an XRef object at the specified startxref position {firstCrossReferenceOffset}");
// use all objects in byte position order (last entries overwrite previous ones) // use all objects in byte position order (last entries overwrite previous ones)
xrefSeqBytePos.AddRange(parts.Select(x => x.Offset)); xrefPartToBytePositionOrder.AddRange(parts.Select(x => x.Offset));
xrefSeqBytePos.Sort(); xrefPartToBytePositionOrder.Sort();
} }
else else
{ {
// copy xref type // copy xref type
type = currentPart.Type; type = currentPart.Type;
// found starting Xref object // found starting Xref object
// add this and follow chain defined by 'Prev' keys // add this and follow chain defined by 'Prev' keys
xrefSeqBytePos.Add(firstCrossReferenceOffset); xrefPartToBytePositionOrder.Add(firstCrossReferenceOffset);
// Get any streams that are tied to this table.
var activePart = currentPart;
var dependents = parts.Where(x => x.TiedToXrefAtOffset == activePart.Offset);
foreach (var dependent in dependents)
{
xrefPartToBytePositionOrder.Add(dependent.Offset);
}
while (currentPart.Dictionary != null) while (currentPart.Dictionary != null)
{ {
@@ -72,21 +79,21 @@
break; break;
} }
xrefSeqBytePos.Add(prevBytePos); xrefPartToBytePositionOrder.Add(prevBytePos);
// sanity check to prevent infinite loops // sanity check to prevent infinite loops
if (xrefSeqBytePos.Count >= parts.Count) if (xrefPartToBytePositionOrder.Count >= parts.Count)
{ {
break; break;
} }
} }
// have to reverse order so that later XRefs will overwrite previous ones // have to reverse order so that later XRefs will overwrite previous ones
xrefSeqBytePos.Reverse(); xrefPartToBytePositionOrder.Reverse();
} }
// merge used and sorted XRef/trailer // merge used and sorted XRef/trailer
foreach (long bPos in xrefSeqBytePos) foreach (long bPos in xrefPartToBytePositionOrder)
{ {
var currentObject = parts.First(x => x.Offset == bPos || x.Offset == bPos + offsetCorrection); var currentObject = parts.First(x => x.Offset == bPos || x.Offset == bPos + offsetCorrection);
if (currentObject.Dictionary != null) if (currentObject.Dictionary != null)

View File

@@ -33,13 +33,24 @@
public CrossReferenceType Type { get; } public CrossReferenceType Type { get; }
public CrossReferenceTablePart(IReadOnlyDictionary<IndirectReference, long> objectOffsets, long offset, long previous, DictionaryToken dictionary, CrossReferenceType type) /// <summary>
/// For an Xref stream indicated by a table, the offset of that table; the stream and the table should be used together when constructing the final table.
/// </summary>
public long? TiedToXrefAtOffset { get; }
public CrossReferenceTablePart(
IReadOnlyDictionary<IndirectReference, long> objectOffsets,
long offset, long previous,
DictionaryToken dictionary,
CrossReferenceType type,
long? tiedToXrefAtOffset)
{ {
ObjectOffsets = objectOffsets; ObjectOffsets = objectOffsets;
Offset = offset; Offset = offset;
Previous = previous; Previous = previous;
Dictionary = dictionary; Dictionary = dictionary;
Type = type; Type = type;
TiedToXrefAtOffset = tiedToXrefAtOffset;
} }
public void FixOffset(long offset) public void FixOffset(long offset)

View File

@@ -16,6 +16,8 @@
public CrossReferenceType XRefType { get; set; } public CrossReferenceType XRefType { get; set; }
public long? TiedToPreviousAtOffset { get; set; }
public void Add(long objectId, int generationNumber, long offset) public void Add(long objectId, int generationNumber, long offset)
{ {
IndirectReference objKey = new IndirectReference(objectId, generationNumber); IndirectReference objKey = new IndirectReference(objectId, generationNumber);
@@ -28,7 +30,7 @@
public CrossReferenceTablePart Build() public CrossReferenceTablePart Build()
{ {
return new CrossReferenceTablePart(objects, Offset, Previous, Dictionary, XRefType); return new CrossReferenceTablePart(objects, Offset, Previous, Dictionary, XRefType, TiedToPreviousAtOffset);
} }
} }
} }

View File

@@ -82,9 +82,11 @@
// check for a XRef stream, it may contain some object ids of compressed objects // check for a XRef stream, it may contain some object ids of compressed objects
if (tableDictionary.ContainsKey(NameToken.XrefStm)) if (tableDictionary.ContainsKey(NameToken.XrefStm))
{ {
log.Debug("Cross reference table contained referenced to stream. Reading the stream."); log.Debug("Cross reference table contained reference to stream. Reading the stream.");
int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int; var tiedToTableAtOffset = tablePart.Offset;
int streamOffset = ((NumericToken) tableDictionary.Data[NameToken.XrefStm]).Int;
// check the xref stream reference // check the xref stream reference
fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing); fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
@@ -96,8 +98,13 @@
// Update the cross reference table to be a stream instead. // Update the cross reference table to be a stream instead.
tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset)); tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset, tablePart = new CrossReferenceTablePart(
tablePart.Previous, tableDictionary, tablePart.Type); tablePart.ObjectOffsets,
streamOffset,
tablePart.Previous,
tableDictionary,
tablePart.Type,
tiedToTableAtOffset);
} }
// Read the stream from the table. // Read the stream from the table.
@@ -105,7 +112,7 @@
{ {
try try
{ {
TryParseCrossReferenceStream(streamOffset, pdfScanner, out streamPart); TryParseCrossReferenceStream(streamOffset, pdfScanner, tiedToTableAtOffset, out streamPart);
} }
catch (InvalidOperationException ex) catch (InvalidOperationException ex)
{ {
@@ -149,7 +156,7 @@
tokenScanner.Seek(previousCrossReferenceLocation); tokenScanner.Seek(previousCrossReferenceLocation);
// parse xref stream // parse xref stream
if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, out var tablePart)) if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, null, out var tablePart))
{ {
if (!TryBruteForceXrefTableLocate(bytes, previousCrossReferenceLocation, out var actualOffset)) if (!TryBruteForceXrefTableLocate(bytes, previousCrossReferenceLocation, out var actualOffset))
{ {
@@ -218,7 +225,10 @@
return resolved; return resolved;
} }
private bool TryParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner, private bool TryParseCrossReferenceStream(
long objByteOffset,
IPdfTokenScanner pdfScanner,
long? fromTableAtOffset,
out CrossReferenceTablePart xrefTablePart) out CrossReferenceTablePart xrefTablePart)
{ {
xrefTablePart = null; xrefTablePart = null;
@@ -236,7 +246,7 @@
return false; return false;
} }
xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream); xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, fromTableAtOffset, objectStream);
return true; return true;
} }

View File

@@ -19,7 +19,7 @@
/// <summary> /// <summary>
/// Parses through the unfiltered stream and populates the xrefTable HashMap. /// Parses through the unfiltered stream and populates the xrefTable HashMap.
/// </summary> /// </summary>
public CrossReferenceTablePart Parse(long streamOffset, StreamToken stream) public CrossReferenceTablePart Parse(long streamOffset, long? fromTableAtOffset, StreamToken stream)
{ {
var decoded = stream.Decode(filterProvider); var decoded = stream.Decode(filterProvider);
@@ -38,7 +38,8 @@
Offset = streamOffset, Offset = streamOffset,
Previous = previousOffset, Previous = previousOffset,
Dictionary = stream.StreamDictionary, Dictionary = stream.StreamDictionary,
XRefType = CrossReferenceType.Stream XRefType = CrossReferenceType.Stream,
TiedToPreviousAtOffset = fromTableAtOffset
}; };
var objectNumbers = GetObjectNumbers(stream.StreamDictionary); var objectNumbers = GetObjectNumbers(stream.StreamDictionary);