- Fix of Stream invalid Length issue (causing stream data not being fully read).

- Improve Stream read performance by simplifying TryReadStream(), avoiding use of MemoryStream, with benefice of already existing Memory Span of "inputBytes"
This commit is contained in:
Sylvain Bruyere 2024-05-21 13:52:07 +02:00
parent d86c2f44f0
commit bb5a757e8c

View File

@ -1,4 +1,7 @@
namespace UglyToad.PdfPig.Tokenization.Scanner
//TODO: https://blog.cerbero.io/cve-2010-0188-pdfformtiff/
// https://beta-v1.malva.re/file/f1e966769e544b4b67f203ca51909b9d/report
namespace UglyToad.PdfPig.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
@ -7,6 +10,7 @@
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using Core;
using Encryption;
@ -320,7 +324,7 @@
int endStreamPosition = 0;
int commonPartPosition = 0;
const string commonPart = "end";
const string endWordPart = "end";
const string streamPart = "stream";
const string objPart = "obj";
@ -330,151 +334,130 @@
return true;
}
// Track any 'endobj' or 'endstream' operators we see.
var observedEndLocations = new List<PossibleStreamEndLocation>();
long streamDataStart = inputBytes.CurrentOffset;
// Begin reading the stream.
using (var memoryStream = new MemoryStream())
using (var binaryWrite = new BinaryWriter(memoryStream))
PossibleStreamEndLocation? possibleEndLocation = null;
while (inputBytes.MoveNext())
{
while (inputBytes.MoveNext())
if (length.HasValue && read == length)
{
if (length.HasValue && read == length)
{
// TODO: read ahead and check we're at the end...
// break;
}
// TODO: read ahead and check we're at the end...
// break;
}
// We are reading 'end' (possibly).
if (commonPartPosition < commonPart.Length && inputBytes.CurrentByte == commonPart[commonPartPosition])
// We are reading 'end' (possibly).
if (commonPartPosition < endWordPart.Length && inputBytes.CurrentByte == endWordPart[commonPartPosition])
{
commonPartPosition++;
}
else if (commonPartPosition == endWordPart.Length)
{
// We are reading 'stream' after 'end'
if (inputBytes.CurrentByte == streamPart[endStreamPosition])
{
commonPartPosition++;
}
else if (commonPartPosition == commonPart.Length)
{
// We are reading 'stream' after 'end'
if (inputBytes.CurrentByte == streamPart[endStreamPosition])
endObjPosition = 0;
endStreamPosition++;
// We've finished reading 'endstream', add it to the end tokens we've seen.
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
{
endObjPosition = 0;
endStreamPosition++;
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
// We've finished reading 'endstream', add it to the end tokens we've seen.
if (endStreamPosition == streamPart.Length && (!inputBytes.MoveNext() || ReadHelper.IsWhitespace(inputBytes.CurrentByte)))
possibleEndLocation = token;
//observedEndLocations.Add(token);
if (length.HasValue && read > length)
{
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndStream.Data.Length, OperatorToken.EndStream);
observedEndLocations.Add(token);
if (length.HasValue && read > length)
{
break;
}
endStreamPosition = 0;
break;
}
}
else if (inputBytes.CurrentByte == objPart[endObjPosition])
{
// We are reading 'obj' after 'end'
endStreamPosition = 0;
endObjPosition++;
// We have finished reading 'endobj'.
if (endObjPosition == objPart.Length)
{
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
if (observedEndLocations.Count > 0)
{
var lastEndToken = observedEndLocations[observedEndLocations.Count - 1];
inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
break;
}
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
observedEndLocations.Add(token);
if (read > length)
{
break;
}
}
}
else
{
// We were reading 'end' but then we had a character mismatch.
// Reset all the counters.
}
else if (inputBytes.CurrentByte == objPart[endObjPosition])
{
// We are reading 'obj' after 'end'
endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = 0;
endStreamPosition = 0;
endObjPosition++;
// We have finished reading 'endobj'.
if (endObjPosition == objPart.Length)
{
// If we saw an 'endstream' or 'endobj' previously we've definitely hit the end now.
if (possibleEndLocation != null)
{
var lastEndToken = possibleEndLocation.Value; //observedEndLocations[observedEndLocations.Count - 1];
inputBytes.Seek(lastEndToken.Offset + lastEndToken.Type.Data.Length + 1);
//streamDataEnd = lastEndToken.Offset + lastEndToken.Type.Data.Length + 1;
break;
}
var token = new PossibleStreamEndLocation(inputBytes.CurrentOffset - OperatorToken.EndObject.Data.Length, OperatorToken.EndObject);
possibleEndLocation = token;
if (read > length)
{
break;
}
}
}
else
{
// For safety reset every counter in case we had a partial read.
// We were reading 'end' but then we had a character mismatch.
// Reset all the counters.
endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = (inputBytes.CurrentByte == commonPart[0]) ? 1 : 0;
commonPartPosition = 0;
}
binaryWrite.Write(inputBytes.CurrentByte);
read++;
}
binaryWrite.Flush();
if (observedEndLocations.Count == 0)
{
return false;
}
memoryStream.Seek(0, SeekOrigin.Begin);
if (length.HasValue && memoryStream.Length >= length)
{
// Use the declared length to copy just the data we want.
byte[] data = new byte[length.Value];
memoryStream.Read(data, 0, (int)length.Value);
stream = new StreamToken(streamDictionaryToken, data);
}
else
{
// Work out where '\r\nendobj' or '\r\nendstream' occurs and read everything up to that.
var lastEnd = observedEndLocations[observedEndLocations.Count - 1];
// For safety reset every counter in case we had a partial read.
var dataLength = lastEnd.Offset - startDataOffset;
var current = inputBytes.CurrentOffset;
// 3 characters, 'e', '\n' and possibly '\r'
inputBytes.Seek(lastEnd.Offset - 3);
inputBytes.MoveNext();
if (inputBytes.CurrentByte == '\r')
{
dataLength -= 3;
}
else
{
dataLength -= 2;
}
inputBytes.Seek(current);
byte[] data = new byte[dataLength];
memoryStream.Read(data, 0, (int)dataLength);
stream = new StreamToken(streamDictionaryToken, data);
endStreamPosition = 0;
endObjPosition = 0;
commonPartPosition = (inputBytes.CurrentByte == endWordPart[0]) ? 1 : 0;
}
read++;
}
long streamDataEnd = inputBytes.CurrentOffset + 1;
if (possibleEndLocation == null)
return false;
var lastEnd = possibleEndLocation;
var dataLength = lastEnd.Value.Offset - startDataOffset;
// 3 characters, 'e', '\n' and possibly '\r'
inputBytes.Seek(lastEnd.Value.Offset - 3);
inputBytes.MoveNext();
if (inputBytes.CurrentByte == '\r')
{
dataLength -= 3;
}
else
{
dataLength -= 2;
}
Span<byte> data = new byte[dataLength];
inputBytes.Seek(streamDataStart);
inputBytes.Read(data);
inputBytes.Seek(streamDataEnd);
stream = new StreamToken(streamDictionaryToken, data.ToArray());
return true;
}