namespace UglyToad.Pdf.Parser.Parts
{
using System;
using System.Collections.Generic;
using Cos;
using IO;
using Util;
using Util.JetBrains.Annotations;
/// <summary>
/// Stores the results of a brute force search for all Cos objects in the document so the search is only performed once.
/// </summary>
internal class BruteForceSearcher
{
    /// <summary>
    /// Offset at which the forward scan starts. The backward scans for whitespace and
    /// object number digits stop once they reach this offset.
    /// </summary>
    private const int MinimumSearchOffset = 6;

    private readonly IRandomAccessRead reader;

    // Cached result of the first search; null until GetObjectLocations has completed once.
    private Dictionary<CosObjectKey, long> objectLocations;

    /// <summary>
    /// Creates a new <see cref="BruteForceSearcher"/> which scans the supplied reader.
    /// </summary>
    /// <param name="reader">The reader providing the bytes to scan.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> is null.</exception>
    public BruteForceSearcher([NotNull] IRandomAccessRead reader)
    {
        this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
    }

    /// <summary>
    /// Scans the reader for "{object number} {generation digit} obj" headers and records the
    /// offset at which each one was found. The scan runs on the first call only; later calls
    /// return the cached dictionary. The reader is moved back to the position it had when the
    /// scan started.
    /// </summary>
    /// <returns>A map from object key to the offset recorded for that object.</returns>
    [NotNull]
    public IReadOnlyDictionary<CosObjectKey, long> GetObjectLocations()
    {
        if (objectLocations != null)
        {
            return objectLocations;
        }

        var lastEndOfFile = GetLastEndOfFileMarker();

        var results = new Dictionary<CosObjectKey, long>();

        var originPosition = reader.GetPosition();

        long currentOffset = MinimumSearchOffset;
        long lastObjectId = long.MinValue;
        int lastGenerationId = int.MinValue;
        long lastObjOffset = long.MinValue;

        byte[] objString = OtherEncodings.StringAsLatin1Bytes(" obj");
        byte[] endobjString = OtherEncodings.StringAsLatin1Bytes("endobj");

        bool endobjFound = false;

        do
        {
            reader.Seek(currentOffset);

            if (ReadHelper.IsString(reader, objString))
            {
                if (TryReadObjectHeader(currentOffset, out var objectId, out var generationId, out var objectOffset))
                {
                    if (lastObjOffset > 0)
                    {
                        // add the former object ID only if there was a subsequent object ID
                        results[new CosObjectKey(lastObjectId, lastGenerationId)] = lastObjOffset;
                    }

                    lastObjectId = objectId;
                    lastGenerationId = generationId;
                    lastObjOffset = objectOffset;

                    // Skip the rest of the " obj" keyword; the loop adds the final +1 below.
                    currentOffset += objString.Length - 1;
                    endobjFound = false;
                }
            }
            else if (ReadHelper.IsString(reader, "endobj"))
            {
                endobjFound = true;

                // Skip the rest of the "endobj" keyword; the loop adds the final +1 below.
                currentOffset += endobjString.Length - 1;
            }

            currentOffset++;
        } while (currentOffset < lastEndOfFile && !reader.IsEof());

        if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0)
        {
            // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker
            // the last object id has to be added here so that it can't get lost as there isn't any subsequent object id
            results[new CosObjectKey(lastObjectId, lastGenerationId)] = lastObjOffset;
        }

        // reestablish origin position
        reader.Seek(originPosition);

        objectLocations = results;

        return objectLocations;
    }

    /// <summary>
    /// Walks backwards from a " obj" keyword to read the generation digit and object number
    /// which precede it.
    /// </summary>
    /// <param name="objKeywordOffset">Offset of the space which starts the " obj" keyword.</param>
    /// <param name="objectId">The object number read, when the method returns true.</param>
    /// <param name="generationId">The generation number read, when the method returns true.</param>
    /// <param name="objectOffset">The offset recorded for the object, when the method returns true.</param>
    /// <returns>True if a generation digit, whitespace and at least one object number digit were found.</returns>
    private bool TryReadObjectHeader(long objKeywordOffset, out long objectId, out int generationId, out long objectOffset)
    {
        objectId = default(long);
        generationId = default(int);
        objectOffset = default(long);

        long tempOffset = objKeywordOffset - 1;
        reader.Seek(tempOffset);

        // Only the single byte directly before " obj" is treated as the generation number.
        int generationChar = reader.Peek();
        if (!ReadHelper.IsDigit(generationChar))
        {
            return false;
        }

        generationId = generationChar - '0';

        tempOffset--;
        reader.Seek(tempOffset);

        if (!ReadHelper.IsSpace(reader))
        {
            return false;
        }

        // Step back over the whitespace separating the object number from the generation number.
        while (tempOffset > MinimumSearchOffset && ReadHelper.IsSpace(reader))
        {
            reader.Seek(--tempOffset);
        }

        // Step back over the digits of the object number.
        bool objectIdFound = false;
        while (tempOffset > MinimumSearchOffset && ReadHelper.IsDigit(reader))
        {
            reader.Seek(--tempOffset);
            objectIdFound = true;
        }

        if (!objectIdFound)
        {
            return false;
        }

        // Consume the byte the backward scan stopped on so the number is read from the next byte.
        reader.Read();
        objectId = ObjectHelper.ReadObjectNumber(reader);
        objectOffset = tempOffset + 1;

        return true;
    }

    /// <summary>
    /// Searches backwards from the end of the input for the last "%%EOF" marker and restores
    /// the reader to the position it had on entry.
    /// </summary>
    /// <returns>
    /// The reader position reported after the marker was matched, or <see cref="long.MaxValue"/>
    /// if no marker was found.
    /// </returns>
    private long GetLastEndOfFileMarker()
    {
        var originalOffset = reader.GetPosition();

        var searchTerm = OtherEncodings.StringAsLatin1Bytes("%%EOF");

        var candidateOffset = reader.Length() - searchTerm.Length;

        if (candidateOffset < 0)
        {
            // The input is shorter than the marker itself, so it cannot contain one;
            // this also avoids seeking to a negative offset. The reader has not been moved.
            return long.MaxValue;
        }

        reader.Seek(candidateOffset);

        while (reader.GetPosition() > 0)
        {
            if (ReadHelper.IsString(reader, searchTerm))
            {
                var position = reader.GetPosition();

                reader.Seek(originalOffset);

                return position;
            }

            // Pre-decrement so every offset is tested exactly once on the way back.
            reader.Seek(--candidateOffset);
        }

        reader.Seek(originalOffset);

        return long.MaxValue;
    }
}
}