diff --git a/src/UglyToad.Pdf.Tests/Integration/MultiplePageMortalityStatisticsTests.cs b/src/UglyToad.Pdf.Tests/Integration/MultiplePageMortalityStatisticsTests.cs index d47a6983..8b52aa43 100644 --- a/src/UglyToad.Pdf.Tests/Integration/MultiplePageMortalityStatisticsTests.cs +++ b/src/UglyToad.Pdf.Tests/Integration/MultiplePageMortalityStatisticsTests.cs @@ -2,6 +2,7 @@ { using System; using System.IO; + using Content; using Xunit; public class MultiplePageMortalityStatisticsTests @@ -43,6 +44,30 @@ Assert.Contains("Mortality Statistics: Metadata", page.Text); Assert.Contains("Notification to the registrar by the coroner that he does not consider it necessary to hold an inquest – no post-mortem held (Form 100A – salmon pink)", page.Text); Assert.Contains("Presumption of death certificate", page.Text); + + Assert.Equal(PageSize.Letter, page.Size); + } + } + + [Fact] + public void GetsPagesContent() + { + using (var document = PdfDocument.Open(GetFilename())) + { + var pages = new[] + { + document.GetPage(1), + document.GetPage(2), + document.GetPage(3), + document.GetPage(4), + document.GetPage(5), + document.GetPage(6) + }; + + Assert.Contains(@"Up to 1992, publications gave numbers of deaths registered in the period concerned. From 1993 to 2005, the figures in annual reference volumes relate to the number of deaths that " ++ "occurred in the reference period. From 2006 onwards, all tables in Series DR are based on " ++ "deaths registered in a calendar period. More details on these changes can be found in the " ++ "publication Mortality Statistics: Deaths Registered in 2006 (ONS, 2008)", pages[5].Text); } } } diff --git a/src/UglyToad.Pdf.Tests/Tokenization/EndOfLineTokenizerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/EndOfLineTokenizerTests.cs new file mode 100644 index 00000000..cea09af1 --- /dev/null +++ b/src/UglyToad.Pdf.Tests/Tokenization/EndOfLineTokenizerTests.cs @@ -0,0 +1,45 @@ +namespace UglyToad.Pdf.Tests.Tokenization +{ + using Pdf.Tokenization; + using Pdf.Tokenization.Tokens; + using Xunit; + + public class EndOfLineTokenizerTests + { + private readonly EndOfLineTokenizer tokenizer = new EndOfLineTokenizer(); + + [Fact] + public void CurrentByteIsNotEndOfLineFalse() + { + var input = StringBytesTestConverter.Convert("\r something \n", false); + + var result = tokenizer.TryTokenize((byte)'\0', input.Bytes, out var _); + + Assert.False(result); + } + + [Fact] + public void CurrentByteIsCarriageReturnTrue() + { + var input = StringBytesTestConverter.Convert("\r", false); + + var result = tokenizer.TryTokenize((byte)'\r', input.Bytes, out var token); + + Assert.True(result); + + Assert.Same(EndOfLineToken.Token, token); + } + + [Fact] + public void CurrentByteIsEndOfLineTrue() + { + var input = StringBytesTestConverter.Convert("\n", false); + + var result = tokenizer.TryTokenize((byte)'\n', input.Bytes, out var token); + + Assert.True(result); + + Assert.Same(EndOfLineToken.Token, token); + } + } +} diff --git a/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs b/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs index 2c532fad..ae518e90 100644 --- a/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs +++ b/src/UglyToad.Pdf.Tests/Tokenization/StringTokenizerTests.cs @@ -188,6 +188,65 @@ are the same.)"; Assert.Equal("This string has two +Öctals", AssertStringToken(token).Data); } + [Fact] + public void HandlesEscapedBackslash() + { + const string s = @"(listen\\learn)"; + + var input = StringBytesTestConverter.Convert(s); + + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + + Assert.Equal(@"listen\learn", AssertStringToken(token).Data); + } + + [Theory] + [InlineData(@"(new line \n)", "new line \n")] + [InlineData(@"(carriage return \r)", "carriage return \r")] + [InlineData(@"(tab \t)", "tab \t")] + [InlineData(@"(bell \b)", "bell \b")] + [InlineData(@"(uhmmm \f)", "uhmmm \f")] + public void WritesEscapedCharactersToOutput(string input, string expected) + { + var bytes = StringBytesTestConverter.Convert(input); + + var result = tokenizer.TryTokenize(bytes.First, bytes.Bytes, out var token); + + Assert.True(result); + + Assert.Equal(expected, AssertStringToken(token).Data); + } + + [Fact] + public void EscapedNonEscapeCharacterWritesPlainCharacter() + { + const string s = @"(this does not need escaping \e)"; + + var input = StringBytesTestConverter.Convert(s); + + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + + Assert.Equal(@"this does not need escaping e", AssertStringToken(token).Data); + } + + [Fact] + public void ReachesEndOfInputAssumesEndOfString() + { + const string s = @"(this does not end with bracket"; + + var input = StringBytesTestConverter.Convert(s); + + var result = tokenizer.TryTokenize(input.First, input.Bytes, out var token); + + Assert.True(result); + + Assert.Equal(@"this does not end with bracket", AssertStringToken(token).Data); + } + private static StringToken AssertStringToken(IToken token) { Assert.NotNull(token); diff --git a/src/UglyToad.Pdf/Parser/FileStructure/CrossReferenceTableParser.cs b/src/UglyToad.Pdf/Parser/FileStructure/CrossReferenceTableParser.cs index bd40c27a..daf951d1 100644 --- a/src/UglyToad.Pdf/Parser/FileStructure/CrossReferenceTableParser.cs +++ b/src/UglyToad.Pdf/Parser/FileStructure/CrossReferenceTableParser.cs @@ -51,7 +51,7 @@ var definition = new TableSubsectionDefinition(firstObjectNumber.Long, objectCount.Int); - var tokenizer = new CrossReferenceEndOfLineTokenizer(); + var tokenizer = new EndOfLineTokenizer(); scanner.RegisterCustomTokenizer((byte)'\r', tokenizer); scanner.RegisterCustomTokenizer((byte)'\n', tokenizer); diff --git a/src/UglyToad.Pdf/Tokenization/CrossReferenceEndOfLineTokenizer.cs b/src/UglyToad.Pdf/Tokenization/EndOfLineTokenizer.cs similarity index 64% rename from src/UglyToad.Pdf/Tokenization/CrossReferenceEndOfLineTokenizer.cs rename to src/UglyToad.Pdf/Tokenization/EndOfLineTokenizer.cs index c55a2aa4..aa871485 100644 --- a/src/UglyToad.Pdf/Tokenization/CrossReferenceEndOfLineTokenizer.cs +++ b/src/UglyToad.Pdf/Tokenization/EndOfLineTokenizer.cs @@ -3,7 +3,7 @@ using IO; using Tokens; - internal class CrossReferenceEndOfLineTokenizer : ITokenizer + internal class EndOfLineTokenizer : ITokenizer { public bool ReadsNextByte { get; } = false; @@ -20,13 +20,4 @@ return true; } } - - internal class EndOfLineToken : IToken - { - public static EndOfLineToken Token { get; } = new EndOfLineToken(); - - private EndOfLineToken() - { - } - } } diff --git a/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs b/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs index a8c66a14..272732b3 100644 --- a/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs +++ b/src/UglyToad.Pdf/Tokenization/StringTokenizer.cs @@ -222,11 +222,6 @@ isOctalActive = true; octalsRead = 1; break; - case ')': - // TODO: Handle the weird malformed use case "/Something (C:\)" - // numberOfBrackets = CheckForEndOfString(inputBytes, numberOfBrackets); - builder.Append(c); - break; default: if (c == ReadHelper.AsciiCarriageReturn || c == ReadHelper.AsciiLineFeed) { diff --git a/src/UglyToad.Pdf/Tokenization/Tokens/EndOfLineToken.cs b/src/UglyToad.Pdf/Tokenization/Tokens/EndOfLineToken.cs new file mode 100644 index 00000000..93e17076 --- /dev/null +++ b/src/UglyToad.Pdf/Tokenization/Tokens/EndOfLineToken.cs @@ -0,0 +1,11 @@ +namespace UglyToad.Pdf.Tokenization.Tokens +{ + internal class EndOfLineToken : IToken + { + public static EndOfLineToken Token { get; } = new EndOfLineToken(); + + private EndOfLineToken() + { + } + } +} \ No newline at end of file