Files
PdfPig/src/UglyToad.PdfPig/Parser/Parts/BruteForceSearcher.cs

200 lines
6.3 KiB
C#
Raw Normal View History

namespace UglyToad.PdfPig.Parser.Parts
{
using System;
using System.Collections.Generic;
using System.Globalization;
2018-01-21 18:08:00 +00:00
using System.Text;
using Exceptions;
using IO;
using Util.JetBrains.Annotations;
/// <summary>
/// Store the results of a brute force search for all objects in the document so we only do it once.
/// </summary>
internal class BruteForceSearcher
{
    /// <summary>
    /// Offset at which the search starts; the backwards digit scans never read below it.
    /// </summary>
    private const int MinimumSearchOffset = 6;

    /// <summary>
    /// Number of consecutive iterations without a counter reset after which the search is aborted.
    /// </summary>
    private const int LoopProtectionLimit = 1_000_000;

    private const string ObjectStartMarker = " obj";
    private const string ObjectEndMarker = "endobj";
    private const string EndOfFileMarker = "%%EOF";

    private readonly IInputBytes bytes;

    // Cached result of the first search; null until GetObjectLocations has completed once.
    private Dictionary<IndirectReference, long> objectLocations;

    /// <summary>
    /// Create a new <see cref="BruteForceSearcher"/> over the given input.
    /// </summary>
    /// <param name="bytes">The bytes of the document to search.</param>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    public BruteForceSearcher([NotNull] IInputBytes bytes)
    {
        this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
    }

    /// <summary>
    /// Scan the input for "{object number} {generation} obj" headers and record where each was found.
    /// The scan runs once; later calls return the cached dictionary.
    /// </summary>
    /// <returns>The recorded location for each indirect reference found.</returns>
    /// <exception cref="PdfDocumentFormatException">
    /// The loop protection counter exceeded its limit while searching.
    /// </exception>
    [NotNull]
    public IReadOnlyDictionary<IndirectReference, long> GetObjectLocations()
    {
        if (objectLocations != null)
        {
            return objectLocations;
        }

        var lastEndOfFile = GetLastEndOfFileMarker();
        var results = new Dictionary<IndirectReference, long>();
        var originPosition = bytes.CurrentOffset;

        long currentOffset = MinimumSearchOffset;
        var loopProtection = 0;
        var inObject = false;

        do
        {
            if (loopProtection > LoopProtectionLimit)
            {
                throw new PdfDocumentFormatException("Failed to brute-force search the file due to an infinite loop.");
            }

            loopProtection++;

            if (inObject)
            {
                // Inside an object body: step forward byte by byte until "endobj" is found.
                if (bytes.CurrentByte != 'e')
                {
                    bytes.MoveNext();
                    currentOffset++;
                    loopProtection = 0;
                    continue;
                }

                var next = bytes.Peek();
                if (next.HasValue && next == 'n' && ReadHelper.IsString(bytes, ObjectEndMarker))
                {
                    inObject = false;
                    loopProtection = 0;

                    // Step over the end marker itself.
                    for (var i = 0; i < ObjectEndMarker.Length; i++)
                    {
                        bytes.MoveNext();
                        currentOffset++;
                    }
                }
                else
                {
                    bytes.MoveNext();
                    currentOffset++;
                }

                continue;
            }

            bytes.Seek(currentOffset);

            if (!ReadHelper.IsString(bytes, ObjectStartMarker))
            {
                currentOffset++;
                continue;
            }

            // Current byte is ' '[obj]; walk backwards over the generation number.
            var offset = currentOffset - 1;
            bytes.Seek(offset);
            var generationText = ReadDigitsBackwards(ref offset);

            // We should now be at the space between object and generation number.
            if (!ReadHelper.IsSpace(bytes.CurrentByte))
            {
                // Not an object header. The offset must advance here, otherwise the same
                // candidate is re-examined until the loop protection throws.
                currentOffset++;
                continue;
            }

            bytes.Seek(--offset);
            var objectNumberText = ReadDigitsBackwards(ref offset);

            // Either number may be missing or too large for its type; such a candidate is
            // skipped rather than allowed to abort the whole search with a parse exception.
            if (!long.TryParse(objectNumberText, NumberStyles.Integer, CultureInfo.InvariantCulture, out var obj)
                || !int.TryParse(generationText, NumberStyles.Integer, CultureInfo.InvariantCulture, out var generation))
            {
                currentOffset++;
                continue;
            }

            // The input is still positioned where the backwards scan stopped.
            // NOTE(review): "+ 1" presumably yields the first digit of the object number;
            // confirm against the IInputBytes Seek/CurrentOffset contract.
            results[new IndirectReference(obj, generation)] = bytes.CurrentOffset + 1;

            inObject = true;
            currentOffset++;
            bytes.Seek(currentOffset);
            loopProtection = 0;
        } while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());

        // Every object is recorded as soon as its header is found, so nothing is pending here.

        // reestablish origin position
        bytes.Seek(originPosition);

        objectLocations = results;

        return objectLocations;
    }

    /// <summary>
    /// Collect the run of digits that ends at the current position, reading backwards.
    /// </summary>
    /// <param name="offset">
    /// The offset the input was last sought to; decremented once per digit consumed.
    /// </param>
    /// <returns>The digits in document order, or an empty string when there were none.</returns>
    private string ReadDigitsBackwards(ref long offset)
    {
        var digits = new StringBuilder();

        while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
        {
            digits.Insert(0, (char)bytes.CurrentByte);
            offset--;
            bytes.Seek(offset);
        }

        return digits.ToString();
    }

    /// <summary>
    /// Search backwards from the end of the input for the last "%%EOF" marker.
    /// The input position on entry is restored before returning.
    /// </summary>
    /// <returns>
    /// The value of <c>CurrentOffset</c> at the point the marker matched,
    /// or <see cref="long.MaxValue"/> when no marker was found.
    /// </returns>
    private long GetLastEndOfFileMarker()
    {
        var originalOffset = bytes.CurrentOffset;

        var candidate = bytes.Length - EndOfFileMarker.Length;

        bytes.Seek(candidate);

        while (bytes.CurrentOffset > 0)
        {
            if (ReadHelper.IsString(bytes, EndOfFileMarker))
            {
                var position = bytes.CurrentOffset;

                bytes.Seek(originalOffset);

                return position;
            }

            // Pre-decrement so each position is tested exactly once.
            bytes.Seek(--candidate);
        }

        bytes.Seek(originalOffset);

        return long.MaxValue;
    }
}
}