namespace UglyToad.PdfPig.Parser.Parts { using System; using System.Collections.Generic; using System.Globalization; using System.Text; using Core; using Util.JetBrains.Annotations; /// /// Store the results of a brute force search for all objects in the document so we only do it once. /// internal class BruteForceSearcher { private const int MinimumSearchOffset = 6; private readonly IInputBytes bytes; private Dictionary objectLocations; public BruteForceSearcher([NotNull] IInputBytes bytes) { this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes)); } [NotNull] public IReadOnlyDictionary GetObjectLocations() { if (objectLocations != null) { return objectLocations; } var loopProtection = 0; var lastEndOfFile = GetLastEndOfFileMarker(); var results = new Dictionary(); var originPosition = bytes.CurrentOffset; long currentOffset = MinimumSearchOffset; long lastObjectId = long.MinValue; int lastGenerationId = int.MinValue; long lastObjOffset = long.MinValue; bool inObject = false; bool endobjFound = false; do { if (loopProtection > 1_000_000) { throw new PdfDocumentFormatException("Failed to brute-force search the file due to an infinite loop."); } loopProtection++; if (inObject) { if (bytes.CurrentByte == 'e') { var next = bytes.Peek(); if (next.HasValue && next == 'n') { if (ReadHelper.IsString(bytes, "endobj")) { inObject = false; endobjFound = true; loopProtection = 0; for (int i = 0; i < "endobj".Length; i++) { bytes.MoveNext(); currentOffset++; } } else { bytes.MoveNext(); currentOffset++; } } else { bytes.MoveNext(); currentOffset++; } } else { bytes.MoveNext(); currentOffset++; loopProtection = 0; } continue; } bytes.Seek(currentOffset); if (!ReadHelper.IsString(bytes, " obj")) { currentOffset++; continue; } // Current byte is ' '[obj] var offset = currentOffset - 1; bytes.Seek(offset); var generationBytes = new StringBuilder(); while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset) { generationBytes.Insert(0, (char)bytes.CurrentByte); offset--; bytes.Seek(offset); } // We should now be at the space between object and generation number. if (!ReadHelper.IsSpace(bytes.CurrentByte)) { continue; } bytes.Seek(--offset); var objectNumberBytes = new StringBuilder(); while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset) { objectNumberBytes.Insert(0, (char)bytes.CurrentByte); offset--; bytes.Seek(offset); } var obj = long.Parse(objectNumberBytes.ToString(), CultureInfo.InvariantCulture); var generation = int.Parse(generationBytes.ToString(), CultureInfo.InvariantCulture); results[new IndirectReference(obj, generation)] = bytes.CurrentOffset; inObject = true; endobjFound = false; currentOffset++; bytes.Seek(currentOffset); loopProtection = 0; } while (currentOffset < lastEndOfFile && !bytes.IsAtEnd()); if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0) { // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id results[new IndirectReference(lastObjectId, lastGenerationId)] = lastObjOffset; } // reestablish origin position bytes.Seek(originPosition); objectLocations = results; return objectLocations; } private long GetLastEndOfFileMarker() { var originalOffset = bytes.CurrentOffset; const string searchTerm = "%%EOF"; var minimumEndOffset = bytes.Length - searchTerm.Length; bytes.Seek(minimumEndOffset); while (bytes.CurrentOffset > 0) { if (ReadHelper.IsString(bytes, searchTerm)) { var position = bytes.CurrentOffset; bytes.Seek(originalOffset); return position; } bytes.Seek(minimumEndOffset--); } bytes.Seek(originalOffset); return long.MaxValue; } } }