Make the tokenizer classes internal and change the file header parser to use a token scanner rather than the PDFBox-style reader

This commit is contained in:
Eliot Jones
2018-01-03 20:15:25 +00:00
parent 72ffa1f308
commit f09ef85e5a
18 changed files with 248 additions and 131 deletions

View File

@@ -0,0 +1,135 @@
namespace UglyToad.Pdf.Tests.Parser.Parts
{
    using System;
    using Exceptions;
    using Pdf.Parser.Parts;
    using Xunit;

    /// <summary>
    /// Tests for <see cref="FileHeaderParser"/> covering conforming headers, junk before the header,
    /// malformed version comments and the scanner position after a successful parse.
    /// </summary>
    public class FileHeaderParserTests
    {
        private readonly FileHeaderParser parser = new FileHeaderParser(new TestingLog());

        [Fact]
        public void NullScannerThrows()
        {
            Action parseWithoutScanner = () => parser.Parse(null, false);

            Assert.Throws<ArgumentNullException>(parseWithoutScanner);
        }

        [Theory]
        [InlineData("PDF-1.0")]
        [InlineData("PDF-1.1")]
        [InlineData("PDF-1.7")]
        [InlineData("PDF-1.9")]
        [InlineData("FDF-1.0")]
        [InlineData("FDF-1.9")]
        public void ReadsConformingHeader(string format)
        {
            var header = ParseText($"%{format}\nany garbage", false);

            Assert.Equal(format, header.VersionString);
        }

        [Fact]
        public void ReadsHeaderWithBlankSpaceBefore()
        {
            // The header comment sits on the second line of the input.
            var header = ParseText(@"
%PDF-1.2", false);

            Assert.Equal(1.2m, header.Version);
        }

        [Fact]
        public void EmptyInputThrows()
        {
            AssertFormatException(string.Empty, false);
        }

        [Fact]
        public void HeaderPrecededByJunkNonLenientThrows()
        {
            AssertFormatException(@"one
%PDF-1.2", false);
        }

        [Fact]
        public void HeaderPrecededByJunkLenientReads()
        {
            var header = ParseText(@"one
%PDF-1.7", true);

            Assert.Equal(1.7m, header.Version);
        }

        [Fact]
        public void HeaderPrecededByTooMuchJunkThrows()
        {
            // Three junk tokens before the header are rejected even when lenient.
            AssertFormatException(@"one two
three %PDF-1.6", true);
        }

        [Fact]
        public void JunkThenEndThrows()
        {
            AssertFormatException(@"one two", true);
        }

        [Fact]
        public void VersionFormatInvalidNotLenientThrows()
        {
            AssertFormatException("%Pdeef-1.69", false);
        }

        [Fact]
        public void VersionFormatInvalidLenientDefaults1Point4()
        {
            var header = ParseText("%Pdeef-1.69", true);

            Assert.Equal(1.4m, header.Version);
        }

        [Fact]
        public void ParsingResetsPosition()
        {
            var scanner = StringBytesTestConverter.Scanner(@"%FDF-1.6");

            parser.Parse(scanner, false);

            Assert.Equal(0, scanner.CurrentPosition);
        }

        /// <summary>
        /// Builds a scanner over <paramref name="text"/> and returns the parsed header.
        /// </summary>
        private HeaderVersion ParseText(string text, bool isLenient)
        {
            var scanner = StringBytesTestConverter.Scanner(text);

            return parser.Parse(scanner, isLenient);
        }

        /// <summary>
        /// Asserts that parsing <paramref name="text"/> throws a <see cref="PdfDocumentFormatException"/>.
        /// </summary>
        private void AssertFormatException(string text, bool isLenient)
        {
            var scanner = StringBytesTestConverter.Scanner(text);

            Action parse = () => parser.Parse(scanner, isLenient);

            Assert.Throws<PdfDocumentFormatException>(parse);
        }
    }
}

View File

@@ -2,6 +2,8 @@
{ {
using System.Text; using System.Text;
using IO; using IO;
using Pdf.Tokenization.Scanner;
using Pdf.Util;
public static class StringBytesTestConverter public static class StringBytesTestConverter
{ {
@@ -29,5 +31,12 @@
public IInputBytes Bytes { get; set; } public IInputBytes Bytes { get; set; }
} }
/// <summary>
/// Creates a <see cref="CoreTokenScanner"/> over the bytes produced from <paramref name="s"/>
/// by <c>OtherEncodings.StringAsLatin1Bytes</c>.
/// </summary>
/// <param name="s">The text the scanner should read.</param>
/// <returns>A new scanner over the converted bytes.</returns>
internal static CoreTokenScanner Scanner(string s)
{
    var encoded = OtherEncodings.StringAsLatin1Bytes(s);
    var input = new ByteArrayInputBytes(encoded);

    return new CoreTokenScanner(input);
}
} }
} }

View File

@@ -9,20 +9,21 @@
/// <summary>
/// Creates an input over the supplied bytes with no byte read yet.
/// </summary>
/// <param name="bytes">The bytes to read from.</param>
public ByteArrayInputBytes(IReadOnlyList<byte> bytes)
{
    // -1 marks "before the first byte"; the first MoveNext advances to index 0.
    currentOffset = -1;
    this.bytes = bytes;
}
// Index into the byte list of the most recently read byte; -1 before the first read.
private int currentOffset;

/// <summary>
/// The public offset, which is always one greater than the internal index of the
/// most recently read byte (so it is 0 before anything has been read).
/// </summary>
public int CurrentOffset
{
    get { return currentOffset + 1; }
}
/// <summary>
/// Advances to the next byte and stores it in <c>CurrentByte</c>.
/// </summary>
/// <returns><c>false</c> if the last byte has already been read, otherwise <c>true</c>.</returns>
public bool MoveNext()
{
    var lastIndex = bytes.Count - 1;

    if (currentOffset != lastIndex)
    {
        currentOffset++;
        CurrentByte = bytes[currentOffset];

        return true;
    }

    return false;
}
@@ -30,23 +31,23 @@
/// <summary>
/// Returns the byte following the current one without advancing.
/// </summary>
/// <returns>The next byte, or <c>null</c> when the current byte is the last one.</returns>
public byte? Peek()
{
    var nextIndex = currentOffset + 1;

    return nextIndex == bytes.Count ? (byte?)null : bytes[nextIndex];
}
/// <summary>
/// Whether the most recently read byte is the last byte of the input.
/// </summary>
public bool IsAtEnd() => currentOffset == bytes.Count - 1;
/// <summary>
/// Moves to the given position and updates <c>CurrentByte</c> to the byte just before it.
/// </summary>
/// <param name="position">The offset to move to; 0 means before the first byte.</param>
public void Seek(long position)
{
    var index = (int)position - 1;

    // Store the new index before touching the byte list, as the original ordering does.
    currentOffset = index;

    if (index < 0)
    {
        // Nothing has been read at the very start, so the current byte is zeroed.
        CurrentByte = 0;
    }
    else
    {
        CurrentByte = bytes[index];
    }
}
} }
} }

View File

@@ -2,9 +2,10 @@
{ {
using System; using System;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using IO; using Exceptions;
using Logging; using Logging;
using Util; using Tokenization.Scanner;
using Tokenization.Tokens;
using Util.JetBrains.Annotations; using Util.JetBrains.Annotations;
/// <summary> /// <summary>
@@ -21,14 +22,12 @@
/// %PDF1.5 /// %PDF1.5
/// %PDF1.6 /// %PDF1.6
/// %PDF1.7 /// %PDF1.7
/// This parser allows versions up to 1.9.
/// For versions equal or greater to PDF 1.4, the optional Version entry in the documents catalog dictionary should be used instead of the header version. /// For versions equal or greater to PDF 1.4, the optional Version entry in the documents catalog dictionary should be used instead of the header version.
/// </remarks> /// </remarks>
public class FileHeaderParser internal class FileHeaderParser
{ {
private const string PdfHeader = "%PDF-"; private static readonly Regex VersionRegex = new Regex(@"[FP]DF-(?<version>1.\d)", RegexOptions.IgnoreCase);
private const string FdfHeader = "%FDF-";
private const string PdfDefaultVersion = "1.4";
private const string FdfDefaultVersion = "1.0";
private readonly ILog log; private readonly ILog log;
@@ -38,115 +37,59 @@
} }
/// <summary>
/// Reads the version header comment from the start of the document and rewinds the scanner to position 0.
/// </summary>
/// <param name="scanner">The scanner positioned at the start of the document.</param>
/// <param name="isLenientParsing">
/// When <c>true</c> up to two non-comment tokens may precede the header comment, and a comment
/// which does not contain a valid version yields the default version 1.4 rather than an exception.
/// </param>
/// <returns>The version parsed from the header comment, or version 1.4 when defaulting in lenient mode.</returns>
/// <exception cref="ArgumentNullException"><paramref name="scanner"/> is <c>null</c>.</exception>
/// <exception cref="PdfDocumentFormatException">
/// No token could be read, no comment was found within the permitted number of tokens,
/// or the comment did not contain a valid version and parsing is not lenient.
/// </exception>
[NotNull]
public HeaderVersion Parse([NotNull]ISeekableTokenScanner scanner, bool isLenientParsing)
{
    const string missingHeaderMessage = "Could not find the version header comment at the start of the document.";

    if (scanner == null)
    {
        throw new ArgumentNullException(nameof(scanner));
    }

    // Read the first token
    if (!scanner.MoveNext())
    {
        throw new PdfDocumentFormatException($"Could not read the first token in the document at position {scanner.CurrentPosition}.");
    }

    var comment = scanner.CurrentToken as CommentToken;

    // Lenient parsing tolerates a couple of junk tokens in front of the header comment.
    var junkSkip = isLenientParsing ? 2 : 0;
    var attempts = 0;

    while (comment == null)
    {
        // Give up once the junk allowance is used or the input runs out.
        if (attempts == junkSkip || !scanner.MoveNext())
        {
            throw new PdfDocumentFormatException(missingHeaderMessage);
        }

        comment = scanner.CurrentToken as CommentToken;

        attempts++;
    }

    var match = VersionRegex.Match(comment.Data);

    // The header is machine-readable, so parse with the invariant culture: the current culture
    // may not use '.' as the decimal separator. Only a decimal point is accepted so that a
    // header such as "1,5" is not read as the number 15.
    decimal version = 0;
    var parsed = match.Success && decimal.TryParse(
                     match.Groups["version"].Value,
                     System.Globalization.NumberStyles.AllowDecimalPoint,
                     System.Globalization.CultureInfo.InvariantCulture,
                     out version);

    if (!parsed)
    {
        if (!isLenientParsing)
        {
            throw new PdfDocumentFormatException($"The comment which should have provided the version was in the wrong format: {comment.Data}.");
        }

        log?.Warn($"Did not find a version header of the correct format, defaulting to 1.4 since lenient. Header was: {comment.Data}.");

        // Rewind here too so the scanner is at the start on every successful return.
        scanner.Seek(0);

        return new HeaderVersion(1.4m, "PDF-1.4");
    }

    scanner.Seek(0);

    return new HeaderVersion(version, comment.Data);
}
} }
} }

View File

@@ -1,6 +1,6 @@
namespace UglyToad.Pdf.Parser.Parts namespace UglyToad.Pdf.Parser.Parts
{ {
public class HeaderVersion internal class HeaderVersion
{ {
public decimal Version { get; } public decimal Version { get; }
@@ -11,5 +11,10 @@
Version = version; Version = version;
VersionString = versionString; VersionString = versionString;
} }
/// <summary>
/// Returns the version string prefixed with "Version: ".
/// </summary>
public override string ToString() => $"Version: {VersionString}";
} }
} }

View File

@@ -16,6 +16,7 @@
using Logging; using Logging;
using Parts; using Parts;
using Parts.CrossReference; using Parts.CrossReference;
using Tokenization.Scanner;
using Util; using Util;
internal static class PdfDocumentFactory internal static class PdfDocumentFactory
@@ -28,7 +29,9 @@
var reader = new RandomAccessBuffer(fileBytes); var reader = new RandomAccessBuffer(fileBytes);
var document = OpenDocument(reader, container, isLenientParsing); var tokenScanner = new CoreTokenScanner(new ByteArrayInputBytes(fileBytes));
var document = OpenDocument(reader,tokenScanner, container, isLenientParsing);
return document; return document;
} }
@@ -43,11 +46,11 @@
return Open(File.ReadAllBytes(filename), options); return Open(File.ReadAllBytes(filename), options);
} }
private static PdfDocument OpenDocument(IRandomAccessRead reader, IContainer container, bool isLenientParsing) private static PdfDocument OpenDocument(IRandomAccessRead reader, ISeekableTokenScanner scanner, IContainer container, bool isLenientParsing)
{ {
var log = container.Get<ILog>(); var log = container.Get<ILog>();
var version = container.Get<FileHeaderParser>().ReadHeader(reader, isLenientParsing); var version = container.Get<FileHeaderParser>().Parse(scanner, isLenientParsing);
var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing); var crossReferenceOffset = container.Get<FileTrailerParser>().GetXrefOffset(reader, isLenientParsing);

View File

@@ -32,6 +32,11 @@
[NotNull] [NotNull]
public DocumentInformation Information { get; } public DocumentInformation Information { get; }
/// <summary>
/// The PDF specification version this file declares conformance to, for example 1.4.
/// </summary>
public decimal Version
{
    get { return version.Version; }
}
/// <summary> /// <summary>
/// Get the number of pages in this document. /// Get the number of pages in this document.
/// </summary> /// </summary>

View File

@@ -5,7 +5,7 @@
using Scanner; using Scanner;
using Tokens; using Tokens;
public class ArrayTokenizer : ITokenizer internal class ArrayTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte { get; } = false;

View File

@@ -5,7 +5,7 @@
using Parser.Parts; using Parser.Parts;
using Tokens; using Tokens;
public class CommentTokenizer : ITokenizer internal class CommentTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = true; public bool ReadsNextByte { get; } = true;

View File

@@ -8,7 +8,7 @@
using Tokens; using Tokens;
using Util.JetBrains.Annotations; using Util.JetBrains.Annotations;
public class DictionaryTokenizer : ITokenizer internal class DictionaryTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte { get; } = false;

View File

@@ -5,7 +5,7 @@
using Parser.Parts; using Parser.Parts;
using Tokens; using Tokens;
public class HexTokenizer : ITokenizer internal class HexTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte { get; } = false;

View File

@@ -7,7 +7,7 @@
using Parser.Parts; using Parser.Parts;
using Tokens; using Tokens;
public class NameTokenizer : ITokenizer internal class NameTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = true; public bool ReadsNextByte { get; } = true;

View File

@@ -6,7 +6,7 @@
using IO; using IO;
using Tokens; using Tokens;
public class NumericTokenizer : ITokenizer internal class NumericTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = true; public bool ReadsNextByte { get; } = true;

View File

@@ -5,7 +5,7 @@
using Parser.Parts; using Parser.Parts;
using Tokens; using Tokens;
public class PlainTokenizer : ITokenizer internal class PlainTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = true; public bool ReadsNextByte { get; } = true;

View File

@@ -7,14 +7,7 @@
using Tokenization; using Tokenization;
using Tokens; using Tokens;
internal enum ScannerScope internal class CoreTokenScanner : ISeekableTokenScanner
{
None,
Array,
Dictionary
}
public class CoreTokenScanner : ITokenScanner
{ {
private static readonly HexTokenizer HexTokenizer = new HexTokenizer(); private static readonly HexTokenizer HexTokenizer = new HexTokenizer();
private static readonly StringTokenizer StringTokenizer = new StringTokenizer(); private static readonly StringTokenizer StringTokenizer = new StringTokenizer();
@@ -48,6 +41,13 @@
return false; return false;
} }
/// <summary>
/// Moves the underlying input bytes to the given position.
/// </summary>
/// <param name="position">The position to move to.</param>
// NOTE(review): only the input is repositioned; hasBytePreRead is left as it was.
// Confirm the next MoveNext behaves correctly after a seek.
public void Seek(long position) => inputBytes.Seek(position);
/// <summary>
/// The current offset reported by the underlying input bytes.
/// </summary>
public long CurrentPosition
{
    get { return inputBytes.CurrentOffset; }
}
private bool hasBytePreRead; private bool hasBytePreRead;
internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None) internal CoreTokenScanner(IInputBytes inputBytes, ScannerScope scope = ScannerScope.None)

View File

@@ -2,7 +2,7 @@
{ {
using Tokens; using Tokens;
public interface ITokenScanner internal interface ITokenScanner
{ {
bool MoveNext(); bool MoveNext();
@@ -10,4 +10,11 @@
bool TryReadToken<T>(out T token) where T : class, IToken; bool TryReadToken<T>(out T token) where T : class, IToken;
} }
/// <summary>
/// An <see cref="ITokenScanner"/> which also reports its current position and can be moved to another one.
/// </summary>
internal interface ISeekableTokenScanner : ITokenScanner
{
    /// <summary>
    /// The current position of the scanner in its input.
    /// </summary>
    long CurrentPosition { get; }

    /// <summary>
    /// Moves the scanner to the given position in its input.
    /// </summary>
    /// <param name="position">The position to move to.</param>
    void Seek(long position);
}
} }

View File

@@ -0,0 +1,9 @@
namespace UglyToad.Pdf.Tokenization.Scanner
{
    /// <summary>
    /// The scope passed to a token scanner when it is created.
    /// NOTE(review): presumably this identifies the container the scanner is reading inside; confirm against the scanner.
    /// </summary>
    internal enum ScannerScope
    {
        /// <summary>
        /// No scope; this is the zero value.
        /// </summary>
        None = 0,

        /// <summary>
        /// Array scope.
        /// </summary>
        Array = 1,

        /// <summary>
        /// Dictionary scope.
        /// </summary>
        Dictionary = 2
    }
}

View File

@@ -6,7 +6,7 @@
using Tokens; using Tokens;
using Util; using Util;
public class StringTokenizer : ITokenizer internal class StringTokenizer : ITokenizer
{ {
public bool ReadsNextByte { get; } = false; public bool ReadsNextByte { get; } = false;