// PdfPig/src/UglyToad.PdfPig/Parser/Parts/CrossReference/CrossReferenceStreamParser.cs

namespace UglyToad.PdfPig.Parser.Parts.CrossReference
{
using System;
using System.Collections.Generic;
using Exceptions;
using Filters;
using PdfPig.CrossReference;
using Tokens;
using Util;
/// <summary>
/// Decodes a PDF cross-reference stream into a <see cref="CrossReferenceTablePart"/>.
/// </summary>
internal class CrossReferenceStreamParser
{
    private readonly IFilterProvider filterProvider;

    /// <summary>
    /// Creates a parser which uses <paramref name="filterProvider"/> to decode the stream data.
    /// </summary>
    /// <param name="filterProvider">Supplies the filters used when decoding the stream.</param>
    public CrossReferenceStreamParser(IFilterProvider filterProvider)
    {
        this.filterProvider = filterProvider;
    }

    /// <summary>
    /// Decodes the stream and records one cross-reference entry per fixed-width line of decoded data.
    /// </summary>
    /// <param name="streamOffset">The offset stored as <c>Offset</c> on the resulting part.</param>
    /// <param name="stream">The cross-reference stream to read.</param>
    /// <returns>The part built from the entries of type 1 and type 2 found in the stream.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// The entry width is not positive or the dictionary has no numeric size value.
    /// </exception>
    public CrossReferenceTablePart Parse(long streamOffset, StreamToken stream)
    {
        if (stream == null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        var decoded = stream.Decode(filterProvider);

        var fieldSizes = new CrossReferenceStreamFieldSize(stream.StreamDictionary);

        // A zero width would otherwise surface as a DivideByZeroException below.
        if (fieldSizes.LineLength <= 0)
        {
            throw new PdfDocumentFormatException($"The cross-reference stream entries must have a positive width: {stream.StreamDictionary}.");
        }

        var lineCount = decoded.Count / fieldSizes.LineLength;

        // -1 marks "no previous cross-reference section".
        long previousOffset = -1;
        if (stream.StreamDictionary.TryGet(NameToken.Prev, out var prevToken) && prevToken is NumericToken prevNumeric)
        {
            previousOffset = prevNumeric.Long;
        }

        var builder = new CrossReferenceTablePartBuilder
        {
            Offset = streamOffset,
            Previous = previousOffset,
            Dictionary = stream.StreamDictionary,
            XRefType = CrossReferenceType.Stream
        };

        var objectNumbers = GetObjectNumbers(stream.StreamDictionary);
        var lineBuffer = new byte[fieldSizes.LineLength];

        // Only read entries which have both an object number and a complete line of data.
        var entryCount = Math.Min(lineCount, objectNumbers.Count);

        for (var lineNumber = 0; lineNumber < entryCount; lineNumber++)
        {
            var byteOffset = lineNumber * fieldSizes.LineLength;
            for (var i = 0; i < fieldSizes.LineLength; i++)
            {
                lineBuffer[i] = decoded[byteOffset + i];
            }

            // When the type field has zero width every entry is treated as type 1.
            var type = fieldSizes.Field1Size == 0
                ? 1
                : (int)ReadBigEndian(lineBuffer, 0, fieldSizes.Field1Size);

            ReadNextStreamObject(type, objectNumbers[lineNumber], fieldSizes, builder, lineBuffer);
        }

        return builder.Build();
    }

    /// <summary>
    /// Interprets a single entry line and adds it to the builder where applicable.
    /// </summary>
    private static void ReadNextStreamObject(int type, long objectNumber, CrossReferenceStreamFieldSize fieldSizes,
        CrossReferenceTablePartBuilder builder, byte[] lineBuffer)
    {
        switch (type)
        {
            case 0:
                // Ignore free objects.
                break;
            case 1:
                // Entry not stored in an object stream: field 2 is the offset, field 3 the generation number.
                // The offset is kept as a long so values of 2^31 and above do not overflow.
                var offset = ReadBigEndian(lineBuffer, fieldSizes.Field1Size, fieldSizes.Field2Size);
                var generation = (int)ReadBigEndian(lineBuffer, fieldSizes.Field1Size + fieldSizes.Field2Size, fieldSizes.Field3Size);

                builder.Add(objectNumber, generation, offset);
                break;
            case 2:
                // Entry stored inside an object stream: field 2 is the object number of that stream.
                // It is recorded negated so that it can be told apart from a file offset;
                // field 3 (the index within the object stream) is not recorded here.
                var objectStreamNumber = ReadBigEndian(lineBuffer, fieldSizes.Field1Size, fieldSizes.Field2Size);

                builder.Add(objectNumber, 0, -objectStreamNumber);
                break;
            default:
                // Any other type is skipped.
                break;
        }
    }

    /// <summary>
    /// Reads <paramref name="length"/> bytes starting at <paramref name="start"/> as a big-endian unsigned integer.
    /// A <paramref name="length"/> of zero yields zero.
    /// </summary>
    private static long ReadBigEndian(byte[] buffer, int start, int length)
    {
        long value = 0;
        for (var i = 0; i < length; i++)
        {
            value = (value << 8) | buffer[start + i];
        }

        return value;
    }

    /// <summary>
    /// Builds the object numbers for the stream's entries, in entry order.
    /// </summary>
    /// <remarks>
    /// When the dictionary has an index array every (first object number, count) pair in it is expanded;
    /// a trailing unpaired value is ignored. Without an index array the numbers run from 0 up to the size value.
    /// </remarks>
    /// <exception cref="PdfDocumentFormatException">The dictionary has no numeric size value.</exception>
    private static List<long> GetObjectNumbers(DictionaryToken dictionary)
    {
        if (!dictionary.TryGet(NameToken.Size, out var sizeToken) || !(sizeToken is NumericToken sizeNumeric))
        {
            throw new PdfDocumentFormatException($"The stream dictionary must contain a numeric size value: {dictionary}.");
        }

        var objectNumbers = new List<long>();

        if (dictionary.TryGet(NameToken.Index, out var indexToken) && indexToken is ArrayToken indexArrayToken)
        {
            var valueCount = indexArrayToken.Data.Count;

            for (var pair = 0; pair + 1 < valueCount; pair += 2)
            {
                long firstObjectNumber = indexArrayToken.GetNumeric(pair).Int;
                var count = indexArrayToken.GetNumeric(pair + 1).Int;

                for (var i = 0; i < count; i++)
                {
                    objectNumbers.Add(firstObjectNumber + i);
                }
            }
        }
        else
        {
            for (var i = 0; i < sizeNumeric.Int; i++)
            {
                objectNumbers.Add(i);
            }
        }

        return objectNumbers;
    }
}
}