// PdfPig/src/UglyToad.PdfPig/Parser/FileStructure/XrefCosOffsetChecker.cs

namespace UglyToad.PdfPig.Parser.FileStructure
{
using System;
using System.Collections.Generic;
using System.Linq;
using Core;
using CrossReference;
2018-01-22 02:08:00 +08:00
using Logging;
using Parts;
internal class XrefCosOffsetChecker
{
/// <summary>
/// The smallest offset at which an object is accepted. Offsets below this are rejected
/// because there can't be any object at the very beginning of a pdf; it is also the
/// position from which the end-of-file marker search starts.
/// </summary>
private const long MINIMUM_SEARCH_OFFSET = 6;

/// <summary>
/// Receives debug messages when an offset fails validation.
/// </summary>
private readonly ILog log;

/// <summary>
/// Supplies the object locations used when the cross reference offsets fail validation.
/// </summary>
private readonly BruteForceSearcher bruteForceSearcher;

/// <summary>
/// Object locations obtained from <see cref="bruteForceSearcher"/>; <see langword="null"/> until first requested.
/// </summary>
private IReadOnlyDictionary<IndirectReference, long> objectKeyOffsets;

/// <summary>
/// Create a new <see cref="XrefCosOffsetChecker"/>.
/// </summary>
/// <param name="log">The log to write debug messages to.</param>
/// <param name="bruteForceSearcher">The searcher queried for object locations when validation fails.</param>
public XrefCosOffsetChecker(ILog log, BruteForceSearcher bruteForceSearcher)
{
    this.bruteForceSearcher = bruteForceSearcher;
    this.log = log;
}
/// <summary>
/// Checks each non-negative offset in the cross reference against the input bytes.
/// </summary>
/// <param name="bytes">The input bytes the offsets refer to.</param>
/// <param name="xrefOffset">The offsets keyed by object reference; <see langword="null"/> is treated as valid.</param>
/// <returns>
/// <see langword="false"/> as soon as one offset fails the check (a debug message is logged),
/// otherwise <see langword="true"/>.
/// </returns>
private bool ValidateXrefOffsets(IInputBytes bytes, Dictionary<IndirectReference, long> xrefOffset)
{
    if (xrefOffset == null)
    {
        return true;
    }

    foreach (KeyValuePair<IndirectReference, long> pair in xrefOffset)
    {
        // A negative value is not a file offset: it represents an object number
        // (see the type 2 entry in an xref stream), so there is nothing to check.
        if (pair.Value < 0)
        {
            continue;
        }

        if (CheckObjectKeys(bytes, pair.Key, pair.Value))
        {
            continue;
        }

        log.Debug($"Stop checking xref offsets as at least one ({pair.Key}) couldn't be dereferenced");
        return false;
    }

    return true;
}
2018-01-22 02:08:00 +08:00
/// <summary>
/// Checks whether the object string for <paramref name="objectKey"/> is found at
/// <paramref name="offset"/> in the input, tolerating one leading whitespace byte.
/// </summary>
/// <param name="bytes">The input bytes; the current offset is restored before returning.</param>
/// <param name="objectKey">The reference whose object number and generation are expected at the offset.</param>
/// <param name="offset">The offset to check.</param>
/// <returns>
/// <see langword="true"/> if <c>ReadHelper.IsString</c> reports a match for the object string;
/// <see langword="false"/> if the offset is below the minimum search offset, there is no match,
/// or an exception occurs while reading.
/// </returns>
private bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
{
    // There can't be any object at the very beginning of a pdf.
    if (offset < MINIMUM_SEARCH_OFFSET)
    {
        return false;
    }

    long objectNumber = objectKey.ObjectNumber;
    long generation = objectKey.Generation;

    long originOffset = bytes.CurrentOffset;
    string objectString = ObjectHelper.CreateObjectString(objectNumber, generation);

    try
    {
        bytes.Seek(offset);

        // Step over a single whitespace byte in front of the object string.
        if (ReadHelper.IsWhitespace(bytes.CurrentByte))
        {
            bytes.MoveNext();
        }

        return ReadHelper.IsString(bytes, objectString);
    }
    catch (Exception ex)
    {
        // A failure here just means there is no valid object at this offset;
        // record it for diagnostics rather than discarding it silently.
        log.Debug($"Could not check for object {objectKey} at offset {offset}: {ex.Message}");
        return false;
    }
    finally
    {
        // Runs on every path (match, no match, exception), so this is the only
        // place the original position needs to be restored.
        bytes.Seek(originOffset);
    }
}
2018-01-22 02:08:00 +08:00
/// <summary>
/// Returns the object locations from the brute force searcher, asking the searcher
/// only while no result has been stored in <see cref="objectKeyOffsets"/> yet.
/// </summary>
/// <returns>The stored object locations keyed by object reference.</returns>
private IReadOnlyDictionary<IndirectReference, long> getBFCosObjectOffsets()
{
    return objectKeyOffsets ?? (objectKeyOffsets = bruteForceSearcher.GetObjectLocations());
}
2018-01-22 02:08:00 +08:00
/// <summary>
/// Check that the offsets in the cross reference are correct.
/// </summary>
/// <remarks>
/// Does nothing unless <paramref name="isLenientParsing"/> is set. Otherwise the offsets are
/// copied from <paramref name="xrefTrailerResolver"/> and validated against
/// <paramref name="bytes"/>; if validation fails, the locations found by the brute force
/// searcher are written over the copied offsets.
/// </remarks>
/// <param name="bytes">The input bytes the offsets refer to.</param>
/// <param name="xrefTrailerResolver">The cross reference table whose object offsets are checked.</param>
/// <param name="isLenientParsing">Whether lenient parsing (and therefore repair) is enabled.</param>
public void CheckCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
{
    // Repair mode isn't available in non-lenient mode.
    if (!isLenientParsing)
    {
        return;
    }

    Dictionary<IndirectReference, long> xrefOffset = xrefTrailerResolver.ObjectOffsets.ToDictionary(x => x.Key, x => x.Value);

    if (ValidateXrefOffsets(bytes, xrefOffset))
    {
        return;
    }

    IReadOnlyDictionary<IndirectReference, long> bruteForceOffsets = getBFCosObjectOffsets();
    if (bruteForceOffsets == null || bruteForceOffsets.Count == 0)
    {
        return;
    }

    // TODO: object stream handling is not implemented. Entries with a negative offset
    // refer to an object stream (object number = -offset, generation 0). For each such stream:
    //  - if the brute force search found the stream, the objects contained in it should be
    //    removed from the brute force results before they are applied;
    //  - if it did not, the objects contained in it should be removed from xrefOffset.
    // Both steps need a way to list the object numbers contained in an object stream.

    foreach (var item in bruteForceOffsets)
    {
        xrefOffset[item.Key] = item.Value;
    }

    // NOTE(review): xrefOffset is a copy created above and is neither returned nor written
    // back to xrefTrailerResolver in this method, so the repaired offsets do not appear to
    // reach the caller - confirm whether the result should be propagated.
}
2018-01-22 02:08:00 +08:00
/// <summary>
/// The offset recorded by <see cref="BruteForceSearchForEndOfFileMarker"/>;
/// <see langword="null"/> until a search has run, <see cref="long.MaxValue"/> if no marker was recorded.
/// </summary>
private long? lastEndOfFileMarker;

/// <summary>
/// Scans the input from the minimum search offset for "%%EOF" markers and stores the
/// result in <see cref="lastEndOfFileMarker"/>. Does nothing if a result is already stored.
/// The current offset of <paramref name="source"/> is restored once the scan has finished.
/// </summary>
/// <param name="source">The input bytes to scan.</param>
private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
{
    if (lastEndOfFileMarker.HasValue)
    {
        return;
    }

    long originalPosition = source.CurrentOffset;
    source.Seek(MINIMUM_SEARCH_OFFSET);

    while (!source.IsAtEnd())
    {
        // Keep stepping forward until an EOF marker is seen.
        if (!ReadHelper.IsString(source, "%%EOF"))
        {
            source.MoveNext();
            continue;
        }

        long markerOffset = source.CurrentOffset;
        if (markerOffset >= source.Length)
        {
            lastEndOfFileMarker = markerOffset;
            break;
        }

        try
        {
            source.Seek(markerOffset + 5);

            // If an object number and generation can be read after the marker, the
            // following data is some valid pdf content, which most likely indicates that
            // the pdf is linearized, updated or just cut off somewhere in the middle.
            ReadHelper.SkipSpaces(source);
            ObjectHelper.ReadObjectNumber(source);
            ObjectHelper.ReadGenerationNumber(source);
        }
        catch (Exception)
        {
            // The data after the marker is most likely garbage, so remember this marker.
            lastEndOfFileMarker = markerOffset;
        }

        source.MoveNext();
    }

    source.Seek(originalPosition);

    // No EOF marker was recorded.
    if (!lastEndOfFileMarker.HasValue)
    {
        lastEndOfFileMarker = long.MaxValue;
    }
}
}
}