2018-01-11 03:49:32 +08:00
|
|
|
|
namespace UglyToad.PdfPig.Parser.FileStructure
|
2017-11-10 03:14:09 +08:00
|
|
|
|
{
|
|
|
|
|
using System;
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
using System.Linq;
|
2020-01-05 00:38:18 +08:00
|
|
|
|
using Core;
|
2018-11-25 03:02:06 +08:00
|
|
|
|
using CrossReference;
|
2018-01-22 02:08:00 +08:00
|
|
|
|
using Logging;
|
2018-01-05 05:09:47 +08:00
|
|
|
|
using Parts;
|
2017-11-10 03:14:09 +08:00
|
|
|
|
|
|
|
|
|
/// <summary>
/// Checks whether the object offsets recorded in a cross reference table point at the
/// object headers they are supposed to, and consults the offsets located by a
/// <see cref="BruteForceSearcher"/> when at least one of them does not.
/// </summary>
internal class XrefCosOffsetChecker
{
    /// <summary>
    /// Offsets below this value are rejected outright: there can't be any object
    /// at the very beginning of a pdf. Also used as the start position of the
    /// end-of-file marker scan.
    /// </summary>
    private const long MinimumSearchOffset = 6;

    /// <summary>
    /// The marker text searched for by <see cref="BruteForceSearchForEndOfFileMarker"/>.
    /// </summary>
    private const string EndOfFileMarker = "%%EOF";

    private readonly ILog log;

    private readonly BruteForceSearcher bruteForceSearcher;

    // Result of bruteForceSearcher.GetObjectLocations(), fetched on first use and then reused.
    private IReadOnlyDictionary<IndirectReference, long> objectKeyOffsets;

    // null until BruteForceSearchForEndOfFileMarker has run; long.MaxValue when no marker was recorded.
    private long? lastEndOfFileMarker;

    /// <summary>
    /// Create a new <see cref="XrefCosOffsetChecker"/>.
    /// </summary>
    /// <param name="log">Receives a debug message when an offset fails validation.</param>
    /// <param name="bruteForceSearcher">Supplies object locations when the cross reference offsets are invalid.</param>
    public XrefCosOffsetChecker(ILog log, BruteForceSearcher bruteForceSearcher)
    {
        this.log = log;
        this.bruteForceSearcher = bruteForceSearcher;
    }

    /// <summary>
    /// Check that the offsets in the cross reference are correct.
    /// </summary>
    /// <param name="bytes">The input the offsets are validated against.</param>
    /// <param name="xrefTrailerResolver">The cross reference table whose <c>ObjectOffsets</c> are checked.</param>
    /// <param name="isLenientParsing">When <c>false</c> the method returns immediately without checking anything.</param>
    public void CheckCrossReferenceOffsets(IInputBytes bytes, CrossReferenceTable xrefTrailerResolver, bool isLenientParsing)
    {
        // repair mode isn't available in non-lenient mode
        if (!isLenientParsing)
        {
            return;
        }

        Dictionary<IndirectReference, long> xrefOffset = xrefTrailerResolver.ObjectOffsets.ToDictionary(x => x.Key, x => x.Value);

        if (ValidateXrefOffsets(bytes, xrefOffset))
        {
            return;
        }

        IReadOnlyDictionary<IndirectReference, long> bruteForceOffsets = GetBruteForceObjectOffsets();

        // TODO: entries with a negative offset refer to an object stream (the negated value is
        // the stream's object number). Objects contained in such streams are not yet removed
        // from either dictionary before merging; the previous placeholder code for that step
        // only collected the stream references and then did nothing with them.

        // NOTE(review): xrefOffset is a copy created by ToDictionary above and is neither
        // returned nor stored, so the corrected offsets written below never reach
        // xrefTrailerResolver. Writing them back needs a mutator on CrossReferenceTable,
        // which is not visible from this file.
        foreach (var item in bruteForceOffsets)
        {
            xrefOffset[item.Key] = item.Value;
        }
    }

    /// <summary>
    /// Verify every non-negative offset in <paramref name="xrefOffset"/> using <see cref="CheckObjectKeys"/>.
    /// </summary>
    /// <param name="bytes">The input the offsets are validated against.</param>
    /// <param name="xrefOffset">Object references mapped to their recorded offsets; <c>null</c> is treated as valid.</param>
    /// <returns><c>false</c> as soon as one offset fails the check, otherwise <c>true</c>.</returns>
    private bool ValidateXrefOffsets(IInputBytes bytes, Dictionary<IndirectReference, long> xrefOffset)
    {
        if (xrefOffset == null)
        {
            return true;
        }

        foreach (var objectEntry in xrefOffset)
        {
            IndirectReference objectKey = objectEntry.Key;
            long objectOffset = objectEntry.Value;

            // a negative offset number represents a object number itself
            // see type 2 entry in xref stream
            if (objectOffset < 0)
            {
                continue;
            }

            if (!CheckObjectKeys(bytes, objectKey, objectOffset))
            {
                log.Debug($"Stop checking xref offsets as at least one ({objectKey}) couldn't be dereferenced");

                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Check whether the string built by <c>ObjectHelper.CreateObjectString</c> for
    /// <paramref name="objectKey"/> is found at <paramref name="offset"/>.
    /// The position of <paramref name="bytes"/> is moved back to where it was on entry before returning.
    /// </summary>
    /// <param name="bytes">The input to inspect.</param>
    /// <param name="objectKey">The reference whose object number and generation are expected.</param>
    /// <param name="offset">The recorded offset of the object.</param>
    /// <returns><c>true</c> when the expected string is matched, <c>false</c> otherwise (including when reading throws).</returns>
    private bool CheckObjectKeys(IInputBytes bytes, IndirectReference objectKey, long offset)
    {
        // there can't be any object at the very beginning of a pdf
        if (offset < MinimumSearchOffset)
        {
            return false;
        }

        long objectNr = objectKey.ObjectNumber;
        long objectGen = objectKey.Generation;
        long originOffset = bytes.CurrentOffset;

        string objectString = ObjectHelper.CreateObjectString(objectNr, objectGen);

        try
        {
            bytes.Seek(offset);

            // Step over at most one whitespace byte sitting at the recorded offset.
            if (ReadHelper.IsWhitespace(bytes.CurrentByte))
            {
                bytes.MoveNext();
            }

            return ReadHelper.IsString(bytes, objectString);
        }
        catch (Exception)
        {
            // Swallow the exception, obviously there isn't any valid object number
            return false;
        }
        finally
        {
            // Runs on every path, so no separate restore is needed before returning.
            bytes.Seek(originOffset);
        }
    }

    /// <summary>
    /// Get the object locations reported by the brute force searcher, asking it only on the first call.
    /// </summary>
    private IReadOnlyDictionary<IndirectReference, long> GetBruteForceObjectOffsets()
    {
        if (objectKeyOffsets == null)
        {
            objectKeyOffsets = bruteForceSearcher.GetObjectLocations();
        }

        return objectKeyOffsets;
    }

    /// <summary>
    /// Scan <paramref name="source"/> from <see cref="MinimumSearchOffset"/> for end-of-file markers and
    /// record the result in <see cref="lastEndOfFileMarker"/>. Does nothing when a result is already recorded.
    /// The position of <paramref name="source"/> is restored to its entry value afterwards.
    /// </summary>
    /// <remarks>This method is not called from anywhere inside this class.</remarks>
    /// <param name="source">The input to scan.</param>
    private void BruteForceSearchForEndOfFileMarker(IInputBytes source)
    {
        if (lastEndOfFileMarker != null)
        {
            return;
        }

        long startOffset = source.CurrentOffset;

        try
        {
            source.Seek(MinimumSearchOffset);

            while (!source.IsAtEnd())
            {
                // search for EOF marker
                if (ReadHelper.IsString(source, EndOfFileMarker))
                {
                    long tempMarker = source.CurrentOffset;

                    // NOTE(review): this compares the marker's own offset with the length;
                    // confirm whether (tempMarker + marker length) was intended so that a
                    // marker ending exactly at the end of the input is recorded here.
                    if (tempMarker >= source.Length)
                    {
                        lastEndOfFileMarker = tempMarker;
                        break;
                    }

                    try
                    {
                        source.Seek(tempMarker + EndOfFileMarker.Length);

                        // check if the following data is some valid pdf content
                        // which most likely indicates that the pdf is linearized,
                        // updated or just cut off somewhere in the middle
                        ReadHelper.SkipSpaces(source);
                        ObjectHelper.ReadObjectNumber(source);
                        ObjectHelper.ReadGenerationNumber(source);
                    }
                    catch (Exception)
                    {
                        // save the EOF marker as the following data is most likely some garbage
                        lastEndOfFileMarker = tempMarker;
                    }
                }

                source.MoveNext();
            }
        }
        finally
        {
            // Restore the caller's position even if the scan itself throws.
            source.Seek(startOffset);
        }

        // no EOF marker found
        if (lastEndOfFileMarker == null)
        {
            lastEndOfFileMarker = long.MaxValue;
        }
    }
}
|
|
|
|
|
}
|