Support opening a document from a stream, and improve the performance of brute-force searching, since the seek operation is now slower.

This commit is contained in:
Eliot Jones
2018-01-21 19:34:21 +00:00
parent 07161cef28
commit c64e54d6c0
18 changed files with 275 additions and 32 deletions

View File

@@ -2,8 +2,8 @@
{
using System;
using System.IO;
using IO;
using PdfPig.Fonts.Parser;
using PdfPig.IO;
using Xunit;
public class AdobeFontMetricsParserTests

View File

@@ -3,8 +3,8 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using IO;
using PdfPig.Fonts.Parser;
using PdfPig.IO;
using Xunit;
public class CMapParserTests

View File

@@ -3,10 +3,10 @@
using System;
using System.IO;
using System.Linq;
using IO;
using PdfPig.Fonts.TrueType;
using PdfPig.Fonts.TrueType.Parser;
using PdfPig.Fonts.TrueType.Tables;
using PdfPig.IO;
using Xunit;
public class TrueTypeFontParserTests

View File

@@ -1,7 +1,7 @@
namespace UglyToad.PdfPig.Tests.Fonts.TrueType
{
using IO;
using PdfPig.Fonts.TrueType;
using PdfPig.IO;
using Xunit;
public class TrueTypeDataBytesTests

View File

@@ -2,9 +2,9 @@
{
using System.Collections.Generic;
using Content;
using IO;
using PdfPig.Fonts;
using PdfPig.Graphics;
using PdfPig.IO;
using PdfPig.Tokenization.Tokens;
internal class TestOperationContext : IOperationContext

View File

@@ -0,0 +1,70 @@
namespace UglyToad.PdfPig.Tests.IO
{
    using System.IO;
    using PdfPig.IO;
    using PdfPig.Util;
    using Xunit;

    /// <summary>
    /// Checks that the array-backed and stream-backed input byte readers report the same
    /// offsets, bytes and end-of-input state when driven through the same operations.
    /// </summary>
    public class InputBytesTests
    {
        private const string TestData = @"123456789";

        [Fact]
        public void ArrayAndStreamBehaveTheSame()
        {
            var data = OtherEncodings.StringAsLatin1Bytes(TestData);
            var fromArray = new ByteArrayInputBytes(data);

            using (var memory = new MemoryStream(data))
            {
                var fromStream = new StreamInputBytes(memory);

                // Freshly created readers: same length, both positioned at the start.
                Assert.Equal(data.Length, fromArray.Length);
                Assert.Equal(data.Length, fromStream.Length);
                Assert.Equal(0, fromArray.CurrentOffset);
                Assert.Equal(0, fromStream.CurrentOffset);

                // Seeking to offset 5 makes the fifth byte ('5') the current byte.
                fromArray.Seek(5);
                fromStream.Seek(5);
                Assert.Equal(fromArray.CurrentOffset, fromStream.CurrentOffset);
                Assert.Equal((byte)'5', fromArray.CurrentByte);
                Assert.Equal(fromArray.CurrentByte, fromStream.CurrentByte);
                Assert.Equal(fromArray.Peek(), fromStream.Peek());

                // Seeking back to the start leaves no current byte (reported as zero).
                fromArray.Seek(0);
                fromStream.Seek(0);
                Assert.Equal(0, fromArray.CurrentByte);
                Assert.Equal(fromArray.CurrentByte, fromStream.CurrentByte);

                // Read everything after offset 7 from each reader in turn.
                fromArray.Seek(7);
                fromStream.Seek(7);

                var arrayTail = string.Empty;
                for (; fromArray.MoveNext();)
                {
                    arrayTail += (char) fromArray.CurrentByte;
                }

                var streamTail = string.Empty;
                for (; fromStream.MoveNext();)
                {
                    streamTail += (char) fromStream.CurrentByte;
                }

                Assert.Equal("89", streamTail);
                Assert.Equal(arrayTail, streamTail);

                // Both readers must report the end once a read has run off the data.
                Assert.True(fromStream.IsAtEnd());
                Assert.True(fromArray.IsAtEnd());
            }
        }
    }
}

View File

@@ -80,7 +80,8 @@ namespace UglyToad.PdfPig.Tests.Integration
{
var file = GetFilename();
using (var document = PdfDocument.Open(File.ReadAllBytes(file)))
using (var stream = File.OpenRead(file))
using (var document = PdfDocument.Open(stream))
{
var page = document.GetPage(1);

View File

@@ -2,7 +2,7 @@
namespace UglyToad.PdfPig.Tests.Parser.Parts
{
using System;
using IO;
using PdfPig.IO;
using PdfPig.Parser.Parts;
using PdfPig.Util;
using Xunit;

View File

@@ -1,7 +1,7 @@
namespace UglyToad.PdfPig.Tests
{
using System.Text;
using IO;
using PdfPig.IO;
using PdfPig.Tokenization.Scanner;
using PdfPig.Util;

View File

@@ -3,7 +3,7 @@ namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
using IO;
using PdfPig.IO;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokenization.Tokens;
using Xunit;

View File

@@ -3,8 +3,8 @@
using System;
using System.Collections.Generic;
using System.Text;
using IO;
using PdfPig.ContentStream;
using PdfPig.IO;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokenization.Tokens;
using PdfPig.Util;

View File

@@ -1,6 +1,6 @@
namespace UglyToad.PdfPig.Tests.Tokenization
{
using IO;
using PdfPig.IO;
using PdfPig.Tokenization;
using PdfPig.Tokenization.Tokens;
using Xunit;

View File

@@ -51,5 +51,9 @@
currentOffset = (int)position - 1;
CurrentByte = currentOffset < 0 ? (byte)0 : bytes[(int)currentOffset];
}
// No-op: this method body releases nothing.
// NOTE(review): presumably present only to satisfy an IDisposable-based interface - confirm against the class declaration.
public void Dispose()
{
}
}
}

View File

@@ -1,6 +1,8 @@
namespace UglyToad.PdfPig.IO
{
internal interface IInputBytes
using System;
internal interface IInputBytes : IDisposable
{
long CurrentOffset { get; }

View File

@@ -0,0 +1,98 @@
namespace UglyToad.PdfPig.IO
{
    using System;
    using System.IO;

    /// <summary>
    /// An <see cref="IInputBytes"/> implementation which reads its bytes from a readable, seekable <see cref="Stream"/>.
    /// </summary>
    internal class StreamInputBytes : IInputBytes
    {
        private readonly Stream stream;
        private readonly bool shouldDispose;

        // Set when a read returns end-of-stream; cleared again by Seek.
        private bool isAtEnd;

        /// <summary>
        /// The current position of the underlying stream.
        /// </summary>
        public long CurrentOffset => stream.Position;

        /// <summary>
        /// The byte most recently read by <see cref="MoveNext"/>, or 0 after seeking to the start
        /// or after a read which reached the end of the stream.
        /// </summary>
        public byte CurrentByte { get; private set; }

        /// <summary>
        /// The length of the underlying stream.
        /// </summary>
        public long Length => stream.Length;

        /// <summary>
        /// Create a new <see cref="StreamInputBytes"/> wrapping the provided stream.
        /// </summary>
        /// <param name="stream">The stream to read from, it must support both reading and seeking.</param>
        /// <param name="shouldDispose">Whether <see cref="Dispose"/> should also dispose the wrapped stream.</param>
        /// <exception cref="ArgumentNullException">The stream is null.</exception>
        /// <exception cref="ArgumentException">The stream does not support reading or seeking.</exception>
        public StreamInputBytes(Stream stream, bool shouldDispose = true)
        {
            if (stream == null)
            {
                // Name the offending parameter so the failure is diagnosable from the message alone.
                throw new ArgumentNullException(nameof(stream));
            }

            if (!stream.CanRead)
            {
                throw new ArgumentException("The provided stream did not support reading.");
            }

            if (!stream.CanSeek)
            {
                throw new ArgumentException("The provided stream did not support seeking.");
            }

            this.stream = stream;
            this.shouldDispose = shouldDispose;
        }

        /// <summary>
        /// Read the next byte from the stream into <see cref="CurrentByte"/>.
        /// </summary>
        /// <returns><c>true</c> if a byte was read; <c>false</c> if the end of the stream was reached.</returns>
        public bool MoveNext()
        {
            var b = stream.ReadByte();

            if (b == -1)
            {
                isAtEnd = true;
                CurrentByte = 0;
                return false;
            }

            CurrentByte = (byte) b;

            return true;
        }

        /// <summary>
        /// Read the next byte without changing the stream position or <see cref="CurrentByte"/>.
        /// </summary>
        /// <returns>The next byte, or <c>null</c> if there are no more bytes.</returns>
        public byte? Peek()
        {
            var current = CurrentOffset;

            var b = stream.ReadByte();

            // Undo the read so the caller observes no movement.
            stream.Seek(current, SeekOrigin.Begin);

            if (b == -1)
            {
                return null;
            }

            return (byte)b;
        }

        /// <summary>
        /// Whether the most recent read (since the last <see cref="Seek"/>) ran off the end of the stream.
        /// </summary>
        public bool IsAtEnd()
        {
            return isAtEnd;
        }

        /// <summary>
        /// Move so that <see cref="CurrentOffset"/> is <paramref name="position"/> and
        /// <see cref="CurrentByte"/> is the byte immediately before that offset.
        /// Seeking to 0 positions before the first byte with a <see cref="CurrentByte"/> of 0.
        /// </summary>
        /// <param name="position">The offset to move to.</param>
        public void Seek(long position)
        {
            // A previous read may have hit the end of the stream. Seeking moves us somewhere else,
            // so the flag must be cleared; the read below sets it again if the target is past the end.
            isAtEnd = false;

            if (position == 0)
            {
                stream.Seek(0, SeekOrigin.Begin);
                CurrentByte = 0;
            }
            else
            {
                // Step back one byte and read it so that CurrentByte and the offset are both updated.
                stream.Position = position - 1;
                MoveNext();
            }
        }

        /// <summary>
        /// Dispose the wrapped stream, but only when this instance was created with ownership of it.
        /// </summary>
        public void Dispose()
        {
            if (shouldDispose)
            {
                stream?.Dispose();
            }
        }
    }
}

View File

@@ -5,7 +5,6 @@
using System.Text;
using ContentStream;
using IO;
using Util;
using Util.JetBrains.Annotations;
/// <summary>
@@ -47,24 +46,48 @@
bool endobjFound = false;
do
{
bytes.Seek(currentOffset);
if (inObject)
{
if (ReadHelper.IsString(bytes, "endobj"))
if (bytes.CurrentByte == 'e')
{
inObject = false;
endobjFound = true;
currentOffset += "endobj".Length;
var next = bytes.Peek();
if (next.HasValue && next == 'n')
{
if (ReadHelper.IsString(bytes, "endobj"))
{
inObject = false;
endobjFound = true;
for (int i = 0; i < "endobj".Length; i++)
{
bytes.MoveNext();
currentOffset++;
}
}
else
{
bytes.MoveNext();
currentOffset++;
}
}
else
{
bytes.MoveNext();
currentOffset++;
}
}
else
{
bytes.MoveNext();
currentOffset++;
}
continue;
}
bytes.Seek(currentOffset);
if (!ReadHelper.IsString(bytes, " obj"))
{
currentOffset++;
@@ -114,6 +137,8 @@
endobjFound = false;
currentOffset++;
bytes.Seek(currentOffset);
} while (currentOffset < lastEndOfFile && !bytes.IsAtEnd());
if ((lastEndOfFile < long.MaxValue || endobjFound) && lastObjOffset > 0)

View File

@@ -26,17 +26,9 @@
{
public static PdfDocument Open(byte[] fileBytes, ParsingOptions options = null)
{
var container = Bootstrapper.GenerateContainer(options?.Logger);
var isLenientParsing = options?.UseLenientParsing ?? true;
var inputBytes = new ByteArrayInputBytes(fileBytes);
var tokenScanner = new CoreTokenScanner(inputBytes);
var document = OpenDocument(inputBytes, tokenScanner, container, isLenientParsing);
return document;
return Open(inputBytes, options);
}
public static PdfDocument Open(string filename, ParsingOptions options = null)
@@ -49,6 +41,26 @@
return Open(File.ReadAllBytes(filename), options);
}
// Wraps the caller's stream without taking ownership of it (shouldDispose is false),
// then defers to the shared IInputBytes overload.
internal static PdfDocument Open(Stream stream, ParsingOptions options)
    => Open(new StreamInputBytes(stream, shouldDispose: false), options);
// Shared entry point for every input source: builds the container and token scanner,
// then hands over to OpenDocument. Lenient parsing is on unless the options say otherwise.
private static PdfDocument Open(IInputBytes inputBytes, ParsingOptions options = null)
{
    var services = Bootstrapper.GenerateContainer(options?.Logger);
    var lenient = options?.UseLenientParsing ?? true;
    var scanner = new CoreTokenScanner(inputBytes);

    return OpenDocument(inputBytes, scanner, services, lenient);
}
private static PdfDocument OpenDocument(IInputBytes inputBytes, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
{
var log = container.Get<ILog>();
@@ -109,7 +121,7 @@
var caching = new ParsingCachingProviders(bruteForceSearcher, resourceContainer);
return new PdfDocument(log, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
return new PdfDocument(log, inputBytes, version, crossReferenceTable, isLenientParsing, caching, pageFactory, catalog, information,
pdfScanner);
}

View File

@@ -1,6 +1,7 @@
namespace UglyToad.PdfPig
{
using System;
using System.IO;
using Content;
using Cos;
using IO;
@@ -15,12 +16,15 @@
/// </summary>
public class PdfDocument : IDisposable
{
private bool isDisposed = false;
[NotNull]
private readonly HeaderVersion version;
[NotNull]
private readonly CrossReferenceTable crossReferenceTable;
private readonly ILog log;
private readonly IInputBytes inputBytes;
private readonly bool isLenientParsing;
[NotNull]
private readonly ParsingCachingProviders cachingProviders;
@@ -49,7 +53,10 @@
/// </summary>
public int NumberOfPages => Pages.Count;
internal PdfDocument(ILog log, HeaderVersion version, CrossReferenceTable crossReferenceTable,
internal PdfDocument(ILog log,
IInputBytes inputBytes,
HeaderVersion version,
CrossReferenceTable crossReferenceTable,
bool isLenientParsing,
ParsingCachingProviders cachingProviders,
IPageFactory pageFactory,
@@ -57,6 +64,7 @@
DocumentInformation information, IPdfTokenScanner pdfScanner)
{
this.log = log;
this.inputBytes = inputBytes;
this.version = version ?? throw new ArgumentNullException(nameof(version));
this.crossReferenceTable = crossReferenceTable ?? throw new ArgumentNullException(nameof(crossReferenceTable));
this.isLenientParsing = isLenientParsing;
@@ -81,6 +89,17 @@
/// <param name="options">Optional parameters controlling parsing.</param>
/// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
public static PdfDocument Open(string filePath, ParsingOptions options = null) => PdfDocumentFactory.Open(filePath, options);
/// <summary>
/// Creates a <see cref="PdfDocument"/> for reading from the provided stream.
/// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
/// </summary>
/// <param name="stream">
/// A stream of the file contents, this must support reading and seeking.
/// The PdfDocument will not dispose of the provided stream.
/// </param>
/// <param name="options">Optional parameters controlling parsing.</param>
/// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
public static PdfDocument Open(Stream stream, ParsingOptions options = null) => PdfDocumentFactory.Open(stream, options);
/// <summary>
/// Get the page with the specified page number.
@@ -89,6 +108,13 @@
/// <returns>The page.</returns>
public Page GetPage(int pageNumber)
{
    if (isDisposed)
    {
        // The single-string ObjectDisposedException constructor takes the object NAME, not a message,
        // so use the (objectName, message) overload to report the intended text as the message.
        throw new ObjectDisposedException(nameof(PdfDocument), "Cannot access page after the document is disposed.");
    }

    log.Debug($"Accessing page {pageNumber}.");

    return Pages.GetPage(pageNumber);
}
@@ -100,10 +126,15 @@
{
try
{
inputBytes.Dispose();
}
catch
catch (Exception ex)
{
// TODO: something
log.Error("Failed disposing the PdfDocument due to an error.", ex);
}
finally
{
isDisposed = true;
}
}
}