2018-01-10 19:49:32 +00:00
|
|
|
|
namespace UglyToad.PdfPig.Parser.Parts
|
2017-11-09 19:14:09 +00:00
|
|
|
|
{
|
|
|
|
|
using System;
|
|
|
|
|
using System.Collections.Generic;
|
2019-06-18 19:12:51 +01:00
|
|
|
|
using System.Globalization;
|
2018-01-21 18:08:00 +00:00
|
|
|
|
using System.Text;
|
2020-01-04 16:38:18 +00:00
|
|
|
|
using Core;
|
2017-11-09 19:14:09 +00:00
|
|
|
|
using Util.JetBrains.Annotations;
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Store the results of a brute force search for all objects in the document so we only do it once.
/// </summary>
/// <remarks>
/// The search scans the input for the token " obj", then walks backwards over the generation
/// number and the object number that precede it. The first successful search is cached and
/// returned by every later call to <see cref="GetObjectLocations"/>.
/// </remarks>
internal class BruteForceSearcher
{
    /// <summary>
    /// The offset the forward scan starts from; the backward digit reads never go below it.
    /// </summary>
    private const int MinimumSearchOffset = 6;

    /// <summary>
    /// The number of consecutive iterations which may fail to advance the scan before it is aborted.
    /// </summary>
    private const int MaximumStalledIterations = 1_000_000;

    private const string ObjectMarker = " obj";

    private const string EndObjectMarker = "endobj";

    private readonly IInputBytes bytes;

    // Null until the first search has completed.
    private Dictionary<IndirectReference, long> objectLocations;

    /// <summary>
    /// Create a new <see cref="BruteForceSearcher"/> over the provided input.
    /// </summary>
    /// <param name="bytes">The bytes of the document to search.</param>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    public BruteForceSearcher([NotNull] IInputBytes bytes)
    {
        this.bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
    }

    /// <summary>
    /// Find every "[object number] [generation] obj" header in the input and record an offset for each.
    /// </summary>
    /// <remarks>
    /// The position of the input is restored to where it was before the search. Candidates which
    /// turn out not to be well-formed object headers (no separating space, missing or overflowing
    /// numbers) are skipped rather than aborting the search. If the same reference is found more
    /// than once the later occurrence replaces the earlier one.
    /// </remarks>
    /// <returns>
    /// The recorded offset for each reference. The value is the input's current offset after
    /// stepping backwards past the digits of the object number.
    /// </returns>
    /// <exception cref="PdfDocumentFormatException">The scan repeatedly failed to advance.</exception>
    [NotNull]
    public IReadOnlyDictionary<IndirectReference, long> GetObjectLocations()
    {
        if (objectLocations != null)
        {
            return objectLocations;
        }

        var lastEndOfFile = GetLastEndOfFileMarker();

        var results = new Dictionary<IndirectReference, long>();

        var originPosition = bytes.CurrentOffset;

        long currentOffset = MinimumSearchOffset;
        long furthestOffset = long.MinValue;
        var stalledIterations = 0;

        var inObject = false;

        do
        {
            // Only iterations which do not move the scan forward count towards the limit,
            // so large files and large gaps between objects are not mistaken for an infinite loop.
            if (currentOffset > furthestOffset)
            {
                furthestOffset = currentOffset;
                stalledIterations = 0;
            }
            else if (++stalledIterations > MaximumStalledIterations)
            {
                throw new PdfDocumentFormatException("Failed to brute-force search the file due to an infinite loop.");
            }

            if (inObject)
            {
                // Inside an object we only look for its "endobj" marker, one byte at a time.
                var step = 1;
                if (IsAtEndObjectMarker())
                {
                    inObject = false;
                    step = EndObjectMarker.Length;
                }

                for (var i = 0; i < step; i++)
                {
                    bytes.MoveNext();
                    currentOffset++;
                }

                continue;
            }

            bytes.Seek(currentOffset);

            if (!ReadHelper.IsString(bytes, ObjectMarker))
            {
                currentOffset++;
                continue;
            }

            // Current byte is ' '[obj]
            var offset = currentOffset - 1;

            bytes.Seek(offset);

            var generationText = ReadDigitsBackwards(ref offset);

            // We should now be at the space between object and generation number.
            if (!ReadHelper.IsSpace(bytes.CurrentByte))
            {
                // Not an object header; move on, otherwise the same offset is examined forever.
                currentOffset++;
                continue;
            }

            bytes.Seek(--offset);

            var objectNumberText = ReadDigitsBackwards(ref offset);

            if (!long.TryParse(objectNumberText, NumberStyles.Integer, CultureInfo.InvariantCulture, out var obj)
                || !int.TryParse(generationText, NumberStyles.Integer, CultureInfo.InvariantCulture, out var generation))
            {
                // " obj" was not preceded by two usable numbers so this is not an object header.
                currentOffset++;
                continue;
            }

            results[new IndirectReference(obj, generation)] = bytes.CurrentOffset;

            inObject = true;

            currentOffset++;

            bytes.Seek(currentOffset);
        } while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());

        // reestablish origin position
        bytes.Seek(originPosition);

        objectLocations = results;

        return objectLocations;
    }

    /// <summary>
    /// Whether the input is currently positioned at the start of an "endobj" marker.
    /// </summary>
    private bool IsAtEndObjectMarker()
    {
        if (bytes.CurrentByte != 'e')
        {
            return false;
        }

        // Check the following byte before paying for the full string comparison.
        var next = bytes.Peek();

        return next.HasValue && next == 'n' && ReadHelper.IsString(bytes, EndObjectMarker);
    }

    /// <summary>
    /// Collect consecutive digits while stepping backwards from the current position.
    /// </summary>
    /// <param name="offset">
    /// The offset the input was last sought to. On return it is the offset of the position where
    /// reading stopped, which is also where the input is left.
    /// </param>
    /// <returns>The digits in document order, empty if the current byte was not a digit.</returns>
    private string ReadDigitsBackwards(ref long offset)
    {
        var digits = new StringBuilder();

        while (ReadHelper.IsDigit(bytes.CurrentByte) && offset >= MinimumSearchOffset)
        {
            digits.Insert(0, (char)bytes.CurrentByte);
            offset--;
            bytes.Seek(offset);
        }

        return digits.ToString();
    }

    /// <summary>
    /// Search backwards from the end of the input for the last "%%EOF" marker.
    /// </summary>
    /// <remarks>
    /// The position of the input is restored to where it was before the search.
    /// </remarks>
    /// <returns>
    /// The input's current offset at the position where the marker was matched,
    /// or <see cref="long.MaxValue"/> if no marker was found.
    /// </returns>
    private long GetLastEndOfFileMarker()
    {
        var originalOffset = bytes.CurrentOffset;

        const string searchTerm = "%%EOF";

        // The last position at which the whole marker could still fit.
        var minimumEndOffset = bytes.Length - searchTerm.Length;

        bytes.Seek(minimumEndOffset);

        while (bytes.CurrentOffset > 0)
        {
            if (ReadHelper.IsString(bytes, searchTerm))
            {
                var position = bytes.CurrentOffset;

                bytes.Seek(originalOffset);

                return position;
            }

            // Step back before seeking so the position just tested is not tested again.
            bytes.Seek(--minimumEndOffset);
        }

        bytes.Seek(originalOffset);

        return long.MaxValue;
    }
}
|
|
|
|
|
}
|