#62 use length value of stream directly to read the full stream once

This commit is contained in:
Eliot Jones
2019-08-20 21:08:06 +01:00
parent e0a32a701b
commit bbe5409f94
5 changed files with 325 additions and 1 deletions

View File

@@ -1,6 +1,7 @@
namespace UglyToad.PdfPig.Tests.IO
{
using System.IO;
using System.Linq;
using PdfPig.IO;
using PdfPig.Util;
using Xunit;
@@ -72,5 +73,158 @@
Assert.False(array.IsAtEnd());
}
}
/// <summary>
/// Reading from a fresh byte array input fills the buffer from the first byte
/// and leaves the current byte on the last byte that was read.
/// </summary>
[Fact]
public void ReadFromBeginningIsCorrect()
{
    const string expectedText = "endstream";

    var input = StringToBytes("endstream and then <</go[]>>");
    var destination = new byte[expectedText.Length];

    var count = input.Read(destination);

    Assert.Equal(destination.Length, count);
    Assert.Equal(expectedText, OtherEncodings.BytesAsLatin1String(destination));
    Assert.Equal((byte)'m', input.CurrentByte);

    // Two moves step over the space and land on the 'a' of "and".
    Assert.True(input.MoveNext());
    Assert.True(input.MoveNext());
    Assert.Equal((byte)'a', input.CurrentByte);
}
/// <summary>
/// Reading N bytes must leave the input in the same state (offset and current byte)
/// as calling MoveNext N times, both immediately and after one further move.
/// </summary>
[Fact]
public void ReadMatchesMoveBehaviour()
{
    const int count = 3;

    var viaRead = StringToBytes("cows in the south");
    var viaMove = StringToBytes("cows in the north");

    var destination = new byte[count];
    var readCount = viaRead.Read(destination);

    var remaining = count;
    while (remaining > 0)
    {
        viaMove.MoveNext();
        remaining--;
    }

    Assert.Equal(count, readCount);

    // State directly after the read / moves.
    Assert.Equal(viaRead.CurrentOffset, viaMove.CurrentOffset);
    Assert.Equal(viaRead.CurrentByte, viaMove.CurrentByte);

    // State after advancing both inputs by one more byte.
    Assert.Equal(viaRead.MoveNext(), viaMove.MoveNext());
    Assert.Equal(viaRead.CurrentOffset, viaMove.CurrentOffset);
    Assert.Equal(viaRead.CurrentByte, viaMove.CurrentByte);
}
/// <summary>
/// Reading after the input has been advanced starts with the byte following the
/// current byte and leaves the current byte on the last byte read.
/// </summary>
[Fact]
public void ReadFromMiddleIsCorrect()
{
    const string expectedText = "stream";

    var input = StringToBytes("aa stream <<>>");

    // Advance onto the space that precedes "stream".
    for (var step = 0; step < 3; step++)
    {
        Assert.True(input.MoveNext());
    }

    Assert.Equal((byte)' ', input.CurrentByte);

    var destination = new byte[expectedText.Length];
    var count = input.Read(destination);

    Assert.Equal(destination.Length, count);
    Assert.Equal(expectedText, OtherEncodings.BytesAsLatin1String(destination));
    Assert.Equal((byte)'m', input.CurrentByte);

    // Two moves step over the space and land on the first '<'.
    Assert.True(input.MoveNext());
    Assert.True(input.MoveNext());
    Assert.Equal((byte)'<', input.CurrentByte);
}
/// <summary>
/// Requesting more bytes than remain returns only the bytes that were available
/// and leaves the input at its end.
/// </summary>
[Fact]
public void ReadPastEndIsCorrect()
{
    var input = StringToBytes("stream");

    Assert.True(input.MoveNext());
    Assert.True(input.MoveNext());

    var destination = new byte["stream".Length];
    var expectedCount = destination.Length - 2;

    var count = input.Read(destination);

    Assert.Equal(expectedCount, count);

    // Only the leading part of the buffer holds data; the rest is untouched.
    var filled = destination.Take(expectedCount).ToArray();
    Assert.Equal("ream", OtherEncodings.BytesAsLatin1String(filled));

    Assert.Equal((byte)'m', input.CurrentByte);
    Assert.True(input.IsAtEnd());
    Assert.False(input.MoveNext());
}
/// <summary>
/// Reading from a fresh stream-backed input fills the buffer from the first byte
/// and leaves the current byte on the last byte that was read.
/// </summary>
[Fact]
public void ReadFromStreamBeginningIsCorrect()
{
    const string expectedText = "endstream";

    var input = StringToStream("endstream and then <</go[]>>");
    var destination = new byte[expectedText.Length];

    var count = input.Read(destination);

    Assert.Equal(destination.Length, count);
    Assert.Equal(expectedText, OtherEncodings.BytesAsLatin1String(destination));
    Assert.Equal((byte)'m', input.CurrentByte);

    // Two moves step over the space and land on the 'a' of "and".
    Assert.True(input.MoveNext());
    Assert.True(input.MoveNext());
    Assert.Equal((byte)'a', input.CurrentByte);
}
/// <summary>
/// Reading from a stream-backed input that has already been advanced starts with the
/// byte following the current byte and leaves the current byte on the last byte read.
/// </summary>
[Fact]
public void ReadFromStreamMiddleIsCorrect()
{
    const string expectedText = "stream";

    var input = StringToStream("aa stream <<>>");

    // Advance onto the space that precedes "stream".
    for (var step = 0; step < 3; step++)
    {
        Assert.True(input.MoveNext());
    }

    Assert.Equal((byte)' ', input.CurrentByte);

    var destination = new byte[expectedText.Length];
    var count = input.Read(destination);

    Assert.Equal(destination.Length, count);
    Assert.Equal(expectedText, OtherEncodings.BytesAsLatin1String(destination));
    Assert.Equal((byte)'m', input.CurrentByte);

    // Two moves step over the space and land on the first '<'.
    Assert.True(input.MoveNext());
    Assert.True(input.MoveNext());
    Assert.Equal((byte)'<', input.CurrentByte);
}
/// <summary>
/// Requesting more bytes than remain in the stream returns only the bytes that were
/// available and leaves the input at its end.
/// </summary>
[Fact]
public void ReadPastStreamEndIsCorrect()
{
    var input = StringToStream("stream");

    Assert.True(input.MoveNext());
    Assert.True(input.MoveNext());

    var destination = new byte["stream".Length];
    var expectedCount = destination.Length - 2;

    var count = input.Read(destination);

    Assert.Equal(expectedCount, count);

    // Only the leading part of the buffer holds data; the rest is untouched.
    var filled = destination.Take(expectedCount).ToArray();
    Assert.Equal("ream", OtherEncodings.BytesAsLatin1String(filled));

    Assert.Equal((byte)'m', input.CurrentByte);
    Assert.True(input.IsAtEnd());
    Assert.False(input.MoveNext());
}
/// <summary>
/// Wraps the Latin-1 encoding of <paramref name="str"/> in a <see cref="ByteArrayInputBytes"/>.
/// </summary>
private static ByteArrayInputBytes StringToBytes(string str)
{
    var latin1 = OtherEncodings.StringAsLatin1Bytes(str);
    return new ByteArrayInputBytes(latin1);
}
/// <summary>
/// Wraps the Latin-1 encoding of <paramref name="str"/> in a <see cref="StreamInputBytes"/>
/// backed by an in-memory stream.
/// </summary>
private static StreamInputBytes StringToStream(string str)
{
    var latin1 = OtherEncodings.StringAsLatin1Bytes(str);
    var memory = new MemoryStream(latin1);
    return new StreamInputBytes(memory);
}
}
}

View File

@@ -1,5 +1,6 @@
namespace UglyToad.PdfPig.IO
{
using System;
using System.Collections.Generic;
using System.Diagnostics;
@@ -54,6 +55,46 @@
CurrentByte = currentOffset < 0 ? (byte)0 : bytes[(int)currentOffset];
}
/// <summary>
/// Copies bytes into <paramref name="buffer"/>, starting with the byte after the current offset.
/// </summary>
/// <param name="buffer">Destination for the bytes; filled from index 0.</param>
/// <param name="length">
/// Optional number of bytes to read. Defaults to the length of <paramref name="buffer"/>.
/// </param>
/// <returns>
/// The number of bytes copied. This is smaller than the requested amount when fewer bytes remain,
/// and 0 when nothing remains or nothing was requested.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="length"/> is negative or larger than <paramref name="buffer"/>.
/// </exception>
public int Read(byte[] buffer, int? length = null)
{
    if (buffer == null)
    {
        throw new ArgumentNullException(nameof(buffer));
    }

    var bytesToRead = buffer.Length;

    if (length.HasValue)
    {
        // The two-argument constructor is required: the single-string overload treats
        // its argument as the parameter name rather than the message.
        if (length.Value < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(length), $"Cannot use a negative length: {length.Value}.");
        }

        if (length.Value > bytesToRead)
        {
            throw new ArgumentOutOfRangeException(nameof(length), $"Cannot read more bytes {length.Value} than there is space in the buffer {buffer.Length}.");
        }

        bytesToRead = length.Value;
    }

    if (bytesToRead == 0)
    {
        return 0;
    }

    // Number of bytes after the current offset which are still available.
    var viableLength = bytes.Count - currentOffset - 1;

    if (viableLength <= 0)
    {
        // Never report a negative count, even if the offset is at or beyond the final byte.
        return 0;
    }

    var readLength = (int)Math.Min(viableLength, bytesToRead);
    var startFrom = (int)currentOffset;

    for (var i = 0; i < readLength; i++)
    {
        // + 1 because the current offset refers to the byte which has already been consumed.
        buffer[i] = bytes[startFrom + i + 1];
    }

    // Leave the input positioned on the last byte read, matching repeated MoveNext calls.
    currentOffset += readLength;
    CurrentByte = buffer[readLength - 1];

    return readLength;
}
/// <summary>
/// No-op: the body is empty, so nothing is released or closed by this call.
/// </summary>
public void Dispose()
{
}

View File

@@ -42,5 +42,13 @@
/// Move to a given position.
/// </summary>
void Seek(long position);
/// <summary>
/// Fill the buffer with bytes starting from the byte following the current position.
/// After a read of one or more bytes the current byte is the last byte which was read,
/// so the input ends up in the same state as calling MoveNext once per byte read.
/// </summary>
/// <param name="buffer">
/// The destination for the bytes, filled from index 0. When <paramref name="length"/> is not
/// provided the length of the buffer is the number of bytes to read.
/// </param>
/// <param name="length">
/// Optional override for the number of bytes to read. The provided implementations reject
/// a negative value or a value larger than the buffer with an <see cref="System.ArgumentOutOfRangeException"/>.
/// </param>
/// <returns>
/// The number of bytes successfully read. This may be fewer than requested when the end of
/// the input is reached.
/// </returns>
int Read(byte[] buffer, int? length = null);
}
}

View File

@@ -89,6 +89,40 @@
}
}
/// <summary>
/// Reads bytes from the underlying stream into <paramref name="buffer"/>, starting at the
/// stream's current position.
/// </summary>
/// <param name="buffer">Destination for the bytes; filled from index 0.</param>
/// <param name="length">
/// Optional number of bytes to read. Defaults to the length of <paramref name="buffer"/>.
/// </param>
/// <returns>
/// The number of bytes read. This is smaller than the requested amount only when the stream
/// ran out of data, and 0 when nothing was requested or nothing remains.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="length"/> is negative or larger than <paramref name="buffer"/>.
/// </exception>
public int Read(byte[] buffer, int? length = null)
{
    if (buffer == null)
    {
        throw new ArgumentNullException(nameof(buffer));
    }

    var bytesToRead = buffer.Length;

    if (length.HasValue)
    {
        // The two-argument constructor is required: the single-string overload treats
        // its argument as the parameter name rather than the message.
        if (length.Value < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(length), $"Cannot use a negative length: {length.Value}.");
        }

        if (length.Value > bytesToRead)
        {
            throw new ArgumentOutOfRangeException(nameof(length), $"Cannot read more bytes {length.Value} than there is space in the buffer {buffer.Length}.");
        }

        bytesToRead = length.Value;
    }

    if (bytesToRead == 0)
    {
        return 0;
    }

    // Stream.Read is allowed to return fewer bytes than requested before the end of the
    // stream, so keep reading until the request is satisfied or the stream reports no more data.
    var read = 0;
    while (read < bytesToRead)
    {
        var chunk = stream.Read(buffer, read, bytesToRead - read);
        if (chunk <= 0)
        {
            break;
        }

        read += chunk;
    }

    if (read > 0)
    {
        // Leave the input positioned on the last byte read.
        CurrentByte = buffer[read - 1];
    }

    isAtEnd = stream.Position == stream.Length;

    return read;
}
public void Dispose()
{
if (shouldDispose)

View File

@@ -28,6 +28,11 @@
internal class PdfTokenScanner : IPdfTokenScanner
{
/// <summary>
/// The bytes of the 'endstream' keyword. <c>TryReadUsingLength</c> compares these against the
/// bytes found after the declared stream length to decide whether that length can be trusted.
/// </summary>
private static readonly byte[] EndstreamBytes =
{
(byte)'e', (byte)'n', (byte)'d', (byte)'s', (byte)'t', (byte)'r', (byte)'e', (byte)'a', (byte)'m'
};
private static readonly Regex EndsWithNumberRegex = new Regex(@"(?<=^[^\s\d]+)\d+$");
private readonly IInputBytes inputBytes;
@@ -256,6 +261,12 @@
const string streamPart = "stream";
const string objPart = "obj";
if (TryReadUsingLength(inputBytes, length, startDataOffset, out var streamData))
{
stream = new StreamToken(streamDictionaryToken, streamData);
return true;
}
// Track any 'endobj' or 'endstream' operators we see.
var observedEndLocations = new List<PossibleStreamEndLocation>();
@@ -404,6 +415,82 @@
return true;
}
/// <summary>
/// Attempts to read the stream data in one pass by trusting the supplied length.
/// The length is only trusted when the 'endstream' keyword, optionally preceded by up to
/// two end-of-line bytes, is found immediately after the data.
/// </summary>
/// <param name="inputBytes">The input to read from.</param>
/// <param name="length">The expected number of data bytes, if known.</param>
/// <param name="startDataOffset">
/// The position to seek to such that the next read returns the first data byte.
/// </param>
/// <param name="data">The stream data when the method returns true, otherwise null.</param>
/// <returns>
/// True if the data was read using the length. False if the length was missing, unusable,
/// or not followed by 'endstream'; once the input has been moved it is seeked back to
/// <paramref name="startDataOffset"/> before returning false.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The 'endstream' keyword was found but the data could not be read in full.
/// </exception>
private static bool TryReadUsingLength(IInputBytes inputBytes, long? length, long startDataOffset, out byte[] data)
{
    data = null;

    if (!length.HasValue)
    {
        return false;
    }

    var dataLength = length.Value;

    // The length comes from the document and cannot be trusted: a negative or oversized
    // value cannot be used to allocate the data array, so fall back to scanning.
    if (dataLength < 0 || dataLength > int.MaxValue)
    {
        return false;
    }

    if (dataLength + startDataOffset >= inputBytes.Length)
    {
        return false;
    }

    var readBuffer = new byte[EndstreamBytes.Length];

    var newlineCount = 0;

    // Jump to where the data should end and count up to two end-of-line bytes before 'endstream'.
    inputBytes.Seek(dataLength + startDataOffset);

    var next = inputBytes.Peek();

    if (next.HasValue && ReadHelper.IsEndOfLine(next.Value))
    {
        newlineCount++;
        inputBytes.MoveNext();

        next = inputBytes.Peek();

        if (next.HasValue && ReadHelper.IsEndOfLine(next.Value))
        {
            newlineCount++;
            inputBytes.MoveNext();
        }
    }

    var readLength = inputBytes.Read(readBuffer);

    if (readLength != readBuffer.Length)
    {
        // Restore the position like every other failure path after the seek,
        // so the caller's fallback starts from the beginning of the data.
        inputBytes.Seek(startDataOffset);
        return false;
    }

    for (var i = 0; i < EndstreamBytes.Length; i++)
    {
        if (readBuffer[i] != EndstreamBytes[i])
        {
            inputBytes.Seek(startDataOffset);
            return false;
        }
    }

    // The length is confirmed: go back and read the data itself.
    inputBytes.Seek(startDataOffset);

    data = new byte[(int)dataLength];

    var countRead = inputBytes.Read(data);

    if (countRead != data.Length)
    {
        throw new InvalidOperationException($"Reading using the stream length failed to read as many bytes as the stream specified. Wanted {length.Value}, got {countRead} at {startDataOffset + 1}.");
    }

    // Advance by the length of 'endstream'. The line break bytes are accounted for below;
    // only the total distance moved matters here, not the order.
    inputBytes.Read(readBuffer);

    // Skip for the line break before 'endstream'.
    for (var i = 0; i < newlineCount; i++)
    {
        var read = inputBytes.MoveNext();
        if (!read)
        {
            // Do not hand back data alongside a failure result.
            data = null;
            inputBytes.Seek(startDataOffset);
            return false;
        }
    }

    // 1 skip to move past the 'm' in 'endstream'
    inputBytes.MoveNext();

    return true;
}
private DictionaryToken GetStreamDictionary()
{
DictionaryToken streamDictionaryToken;