namespace UglyToad.PdfPig.Parser.FileStructure; using Core; using Filters; using Logging; using System.Linq; using Tokenization.Scanner; using Tokens; using Util; internal static class XrefStreamParser { public static XrefStream? TryReadStreamAtOffset( FileHeaderOffset fileHeaderOffset, long xrefOffset, IInputBytes bytes, ISeekableTokenScanner scanner, ILog log) { if (xrefOffset >= bytes.Length || xrefOffset < 0) { return null; } var offsetCorrectionType = XrefOffsetCorrection.None; var offsetCorrection = 0L; bytes.Seek(xrefOffset); if (!TryReadStreamObjAt(xrefOffset, scanner, out var dictToken) || dictToken == null) { log.Debug($"Did not find the stream at {xrefOffset} attempting correction"); var recovered = TryRecoverOffset(fileHeaderOffset, xrefOffset, scanner); if (recovered == null || !TryReadStreamObjAt(recovered.Value.correctOffset, scanner, out var streamDict) || streamDict == null) { return null; } dictToken = streamDict; offsetCorrection = recovered.Value.correctOffset - xrefOffset; offsetCorrectionType = recovered.Value.correctionType; xrefOffset = recovered.Value.correctOffset; } if (!dictToken.TryGet(NameToken.Type, out NameToken dictType) || dictType != NameToken.Xref) { return null; } if (!dictToken.TryGet(NameToken.W, out ArrayToken dictArray)) { return null; } try { var streamData = ReadStreamTolerant(bytes); if (!streamData.to.HasValue) { return null; } var dataLen = streamData.to.Value - streamData.from; if (dataLen <= 0) { return null; } bytes.Seek(streamData.from); var data = new byte[dataLen]; var readCount = bytes.Read(data); if (readCount != dataLen) { return null; } var stream = new StreamToken(dictToken, data); var decoded = stream.Decode(DefaultFilterProvider.Instance).Span; var fieldSizes = new XrefFieldSize(dictArray); var lineCount = decoded.Length / fieldSizes.LineLength; var objectNumbers = GetObjectNumbers(dictToken); var lineNumber = 0; Span lineBuffer = fieldSizes.LineLength <= 1024 ? stackalloc byte[fieldSizes.LineLength] : new byte[fieldSizes.LineLength]; var numbers = new List<(long obj, int gen, XrefLocation location)>(); foreach (var objectNumber in objectNumbers) { if (lineNumber >= lineCount) { break; } var byteOffset = lineNumber * fieldSizes.LineLength; for (var i = 0; i < fieldSizes.LineLength; i++) { lineBuffer[i] = decoded[byteOffset + i]; } int type; if (fieldSizes.Field1Size == 0) { type = 1; } else { type = 0; for (var i = 0; i < fieldSizes.Field1Size; i++) { type += (lineBuffer[i] & 0x00ff) << ((fieldSizes.Field1Size - i - 1) * 8); } } ReadNextStreamObject(type, objectNumber, fieldSizes, numbers, lineBuffer); lineNumber++; } return new XrefStream( xrefOffset, numbers.ToDictionary(x => new IndirectReference(x.obj, x.gen), x => x.location), dictToken, offsetCorrectionType, offsetCorrection); } catch (Exception ex) { log.Error($"Failed to parse the XRef stream at {xrefOffset}", ex); return null; } } /// /// The provided offset can frequently be close but not quite correct. /// The 2 most common failure modes are that the PDF content starts at some /// non-zero offset in the file so all content is shifted by bytes /// or we're within a few bytes of the offset but not directly at it. /// private static (long correctOffset, XrefOffsetCorrection correctionType)? TryRecoverOffset( FileHeaderOffset fileHeaderOffset, long xrefOffset, ISeekableTokenScanner scanner) { // If the %PDF- version header appears at some offset in the file then treat everything as shifted. if (fileHeaderOffset.Value > 0) { if (TryReadStreamObjAt(xrefOffset + fileHeaderOffset.Value, scanner, out _)) { return (xrefOffset + fileHeaderOffset.Value, XrefOffsetCorrection.FileHeaderOffset); } } return null; } private static void ReadNextStreamObject( int type, long objectNumber, XrefFieldSize fieldSizes, List<(long, int, XrefLocation)> results, ReadOnlySpan lineBuffer) { switch (type) { case 0: // Ignore free objects. break; case 1: var offset = ReadUnsigned( lineBuffer, fieldSizes.Field1Size, fieldSizes.Field2Size); var genNum = ReadUnsigned( lineBuffer, fieldSizes.Field1Size + fieldSizes.Field2Size, fieldSizes.Field3Size); if (offset < 0) { throw new PdfDocumentFormatException( $"Location with negative offset {offset} found for object {objectNumber}"); } results.Add((objectNumber, (int)genNum, XrefLocation.File(offset))); break; case 2: /* * object stored in object stream: * 2nd argument is object number of object stream * 3rd argument is index of object within object stream * */ var objectStreamNumber = ReadUnsigned( lineBuffer, fieldSizes.Field1Size, fieldSizes.Field2Size); var streamIndex = ReadUnsigned( lineBuffer, fieldSizes.Field1Size + fieldSizes.Field2Size, fieldSizes.Field3Size); if (objectStreamNumber < 0) { throw new PdfDocumentFormatException( $"Location with negative or zero object stream number {objectStreamNumber} found for object {objectNumber}"); } if (streamIndex < 0) { throw new PdfDocumentFormatException( $"Location with negative stream index {streamIndex} found for object {objectNumber} in stream {objectStreamNumber}"); } results.Add((objectNumber, 0, XrefLocation.Stream(objectStreamNumber, (int)streamIndex))); break; } } private static long ReadUnsigned(ReadOnlySpan buffer, int start, int width) { long value = 0; for (int i = 0; i < width; i++) { value <<= 8; value |= buffer[start + i]; } return value; } private static (long from, long? to) ReadStreamTolerant(IInputBytes bytes) { var buffer = new CircularByteBuffer("endstream ".Length); var startMarker = bytes.CurrentOffset; long? endMarker = null; while (bytes.CurrentByte == '>' && bytes.MoveNext()) { } bool IsStreamWhitespace() { return bytes.CurrentByte == (byte)' ' || bytes.CurrentByte == (byte)'\r' || bytes.CurrentByte == (byte)'\n'; } var isWhitespaceActive = IsStreamWhitespace(); do { // Normalize whitespace. if (IsStreamWhitespace()) { buffer.Add((byte)' '); if (isWhitespaceActive) { startMarker = bytes.CurrentOffset; } } else { buffer.Add(bytes.CurrentByte); isWhitespaceActive = false; } if (buffer.EndsWith("endstream ")) { endMarker = bytes.CurrentOffset - "endstream ".Length; break; } if (buffer.EndsWith("stream ")) { startMarker = bytes.CurrentOffset; isWhitespaceActive = IsStreamWhitespace(); } else if (buffer.EndsWith("endobj ")) { endMarker = bytes.CurrentOffset - "endobj ".Length; break; } } while (bytes.MoveNext()); return (startMarker, endMarker); } private static ReadOnlySpan GetObjectNumbers(DictionaryToken dictionary) { // The number one greater than the highest object number used in this section or in any section for which this is an update. if (!dictionary.TryGet(NameToken.Size, out var sizeToken) || !(sizeToken is NumericToken sizeNumeric)) { throw new PdfDocumentFormatException($"The stream dictionary must contain a numeric size value: {dictionary}."); } var objNums = new List(); if (dictionary.TryGet(NameToken.Index, out var indexToken) && indexToken is ArrayToken indexArrayToken) { // An array containing a pair of integers for each subsection in this section. // Pair[0] is the first object number in the subsection; Pair[1] is the number of entries in the subsection. for (var i = 0; i < indexArrayToken.Length; i += 2) { var firstObjectNumber = indexArrayToken.GetNumeric(i).Int; var size = indexArrayToken.GetNumeric(i + 1).Int; for (var j = 0; j < size; j++) { objNums.Add(firstObjectNumber + j); } } } else { for (var i = 0; i < sizeNumeric.Int; i++) { objNums.Add(i); } } #if NET return System.Runtime.InteropServices.CollectionsMarshal.AsSpan(objNums); #else return objNums.ToArray(); #endif } private static bool TryReadStreamObjAt(long offset, ISeekableTokenScanner scanner, out DictionaryToken? dictionary) { dictionary = null; scanner.Seek(offset); if (scanner.TryReadToken(out NumericToken _) && scanner.TryReadToken(out NumericToken _) && scanner.TryReadToken(out OperatorToken opToken) && ReferenceEquals(opToken, OperatorToken.StartObject) && scanner.TryReadToken(out DictionaryToken dictToken)) { dictionary = dictToken; return true; } return false; } /// /// The array representing the size of the fields in a cross reference stream. /// private class XrefFieldSize { /// /// The type of the entry. /// public int Field1Size { get; } /// /// Type 0 and 2 is the object number, Type 1 this is the byte offset from beginning of file. /// public int Field2Size { get; } /// /// For types 0 and 1 this is the generation number. For type 2 it is the stream index. /// public int Field3Size { get; } /// /// How many bytes are in a line. /// public int LineLength { get; } public XrefFieldSize(ArrayToken wArray) { if (wArray.Data.Count < 3) { throw new PdfDocumentFormatException($"There must be at least 3 entries in a W entry for a stream dictionary: {wArray}."); } Field1Size = wArray.GetNumeric(0).Int; Field2Size = wArray.GetNumeric(1).Int; Field3Size = wArray.GetNumeric(2).Int; LineLength = Field1Size + Field2Size + Field3Size; } } }