// Source: PdfPig/src/UglyToad.PdfPig/Parser/Parts/CrossReference/CrossReferenceStreamParser.cs
// (Repository viewer listing: 175 lines, 6.5 KiB, C#.)

namespace UglyToad.PdfPig.Parser.Parts.CrossReference
{
using System.Collections.Generic;
using Core;
using Filters;
using PdfPig.CrossReference;
using Tokens;
using Util;
/// <summary>
/// Reads the fixed-width binary entries of a cross-reference stream into a <see cref="CrossReferenceTablePart"/>.
/// </summary>
internal class CrossReferenceStreamParser
{
    private readonly IFilterProvider filterProvider;

    /// <summary>
    /// Creates a parser which decodes cross-reference streams with the supplied filters.
    /// </summary>
    /// <param name="filterProvider">Handed to the stream's <c>Decode</c> call when a stream is parsed.</param>
    public CrossReferenceStreamParser(IFilterProvider filterProvider)
    {
        this.filterProvider = filterProvider;
    }

    /// <summary>
    /// Decodes the stream and records every in-use entry in a new cross-reference table part.
    /// </summary>
    /// <param name="streamOffset">Stored as the <c>Offset</c> of the resulting part.</param>
    /// <param name="fromTableAtOffset">Stored as <c>TiedToPreviousAtOffset</c> of the resulting part.</param>
    /// <param name="stream">The cross-reference stream; its dictionary supplies the field widths, Size, Index and Prev entries.</param>
    /// <returns>The part built from the entries that were read.</returns>
    /// <exception cref="PdfDocumentFormatException">
    /// The combined field width is not positive, or the dictionary has no numeric Size entry.
    /// </exception>
    public CrossReferenceTablePart Parse(long streamOffset, long? fromTableAtOffset, StreamToken stream)
    {
        var decoded = stream.Decode(filterProvider).Span;

        var fieldSizes = new CrossReferenceStreamFieldSize(stream.StreamDictionary);

        // A zero (or negative) entry width would otherwise surface as a DivideByZeroException
        // or an invalid buffer size below, so report it as a document format problem instead.
        var lineLength = fieldSizes.LineLength;
        if (lineLength <= 0)
        {
            throw new PdfDocumentFormatException($"The cross reference stream entries must have a positive width: {stream.StreamDictionary}.");
        }

        // Trailing bytes which do not make up a whole entry are ignored.
        var lineCount = decoded.Length / lineLength;

        // -1 marks "no previous cross-reference section".
        long previousOffset = -1;
        if (stream.StreamDictionary.TryGet(NameToken.Prev, out var prevToken) && prevToken is NumericToken prevNumeric)
        {
            previousOffset = prevNumeric.Long;
        }

        var builder = new CrossReferenceTablePartBuilder
        {
            Offset = streamOffset,
            Previous = previousOffset,
            Dictionary = stream.StreamDictionary,
            XRefType = CrossReferenceType.Stream,
            TiedToPreviousAtOffset = fromTableAtOffset
        };

        var lineBuffer = new byte[lineLength];
        var lineNumber = 0;

        // Object numbers are produced lazily, so only as many as there are entries get generated.
        foreach (var objectNumber in GetObjectNumbers(stream.StreamDictionary))
        {
            if (lineNumber >= lineCount)
            {
                break;
            }

            decoded.Slice(lineNumber * lineLength, lineLength).CopyTo(lineBuffer);

            // When the type field has zero width every entry is treated as type 1.
            var type = 1;
            if (fieldSizes.Field1Size != 0)
            {
                var rawType = ReadBigEndian(lineBuffer, 0, fieldSizes.Field1Size);

                // Values outside the int range cannot match a known type; -1 makes them fall through and be ignored.
                type = rawType >= 0 && rawType <= int.MaxValue ? (int)rawType : -1;
            }

            ReadNextStreamObject(type, objectNumber, fieldSizes, builder, lineBuffer);

            lineNumber++;
        }

        return builder.Build();
    }

    /// <summary>
    /// Interprets a single entry and, for in-use objects, adds it to the builder.
    /// </summary>
    /// <param name="type">Value of the entry's first field; values other than 1 and 2 add nothing.</param>
    /// <param name="objectNumber">Object number the entry describes.</param>
    /// <param name="fieldSizes">Widths in bytes of the three fields in the entry.</param>
    /// <param name="builder">Receives the decoded entry.</param>
    /// <param name="lineBuffer">The raw bytes of the entry, fields laid out back to back.</param>
    private static void ReadNextStreamObject(int type, long objectNumber, CrossReferenceStreamFieldSize fieldSizes,
        CrossReferenceTablePartBuilder builder, byte[] lineBuffer)
    {
        switch (type)
        {
            case 0:
                // Ignore free objects.
                break;
            case 1:
                // Non object stream entries: field 2 is the offset, field 3 the generation number.
                // The offset is read as 64-bit so values of 2^31 and above are not corrupted.
                var offset = ReadBigEndian(lineBuffer, fieldSizes.Field1Size, fieldSizes.Field2Size);
                var generation = unchecked((int)ReadBigEndian(
                    lineBuffer,
                    fieldSizes.Field1Size + fieldSizes.Field2Size,
                    fieldSizes.Field3Size));

                builder.Add(objectNumber, generation, offset);
                break;
            case 2:
                // The object is stored inside an object stream: field 2 is the object number of that
                // stream. Field 3 (the index within the object stream) is not read here.
                // The stream's object number is stored negated in place of an offset so that
                // it can be told apart from a real file offset.
                var objectStreamNumber = ReadBigEndian(lineBuffer, fieldSizes.Field1Size, fieldSizes.Field2Size);

                builder.Add(objectNumber, 0, -objectStreamNumber);
                break;
        }
    }

    /// <summary>
    /// Reads <paramref name="length"/> bytes starting at <paramref name="start"/> as a big-endian number.
    /// </summary>
    /// <remarks>
    /// Returns 0 when <paramref name="length"/> is zero or negative. For more than 8 bytes only the
    /// low 64 bits are kept.
    /// </remarks>
    private static long ReadBigEndian(byte[] buffer, int start, int length)
    {
        long value = 0;
        for (var i = 0; i < length; i++)
        {
            value = (value << 8) | buffer[start + i];
        }

        return value;
    }

    /// <summary>
    /// Produces the object numbers the stream's entries refer to, in entry order.
    /// </summary>
    /// <remarks>
    /// The Size entry is validated immediately; the numbers themselves are generated lazily so a large
    /// Size or Index count in the file does not allocate a list of that length.
    /// </remarks>
    /// <exception cref="PdfDocumentFormatException">The dictionary has no numeric Size entry.</exception>
    private static IEnumerable<long> GetObjectNumbers(DictionaryToken dictionary)
    {
        // The number one greater than the highest object number used in this section or in any section for which this is an update.
        if (!dictionary.TryGet(NameToken.Size, out var sizeToken) || !(sizeToken is NumericToken sizeNumeric))
        {
            throw new PdfDocumentFormatException($"The stream dictionary must contain a numeric size value: {dictionary}.");
        }

        if (dictionary.TryGet(NameToken.Index, out var indexToken) && indexToken is ArrayToken indexArrayToken)
        {
            return EnumerateIndexedObjectNumbers(indexArrayToken);
        }

        // Without an Index array the entries cover objects 0 to Size - 1.
        return EnumerateObjectNumberRange(0, sizeNumeric.Int);
    }

    /// <summary>
    /// Expands an Index array of (first object number, entry count) pairs into object numbers.
    /// </summary>
    private static IEnumerable<long> EnumerateIndexedObjectNumbers(ArrayToken indexArrayToken)
    {
        // Pair[0] is the first object number in the subsection; Pair[1] is the number of entries in the subsection.
        // A trailing element without a partner is ignored rather than read past the end of the array.
        for (var i = 0; i + 1 < indexArrayToken.Length; i += 2)
        {
            var firstObjectNumber = indexArrayToken.GetNumeric(i).Int;
            var size = indexArrayToken.GetNumeric(i + 1).Int;

            foreach (var objectNumber in EnumerateObjectNumberRange(firstObjectNumber, size))
            {
                yield return objectNumber;
            }
        }
    }

    /// <summary>
    /// Yields <paramref name="count"/> consecutive numbers beginning at <paramref name="first"/>;
    /// nothing when <paramref name="count"/> is zero or negative.
    /// </summary>
    private static IEnumerable<long> EnumerateObjectNumberRange(long first, int count)
    {
        for (var i = 0; i < count; i++)
        {
            yield return first + i;
        }
    }
}
}