Files
PdfPig/src/UglyToad.PdfPig.Tests/Tokenization/Scanner/PdfTokenScannerTests.cs
2024-01-10 20:01:51 +00:00

525 lines
18 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
namespace UglyToad.PdfPig.Tests.Tokenization.Scanner
{
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using PdfPig.Core;
using PdfPig.Encryption;
using PdfPig.Tokenization.Scanner;
using PdfPig.Tokens;
using Xunit;
public class PdfTokenScannerTests
{
[Fact]
public void ReadsSimpleObject()
{
const string s = @"294 0 obj
/WDKAAR+CMBX12
endobj";
var pdfScanner = GetScanner(s);
pdfScanner.MoveNext();
var objectToken = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
var name = Assert.IsType<NameToken>(objectToken.Data);
Assert.Equal(294, objectToken.Number.ObjectNumber);
Assert.Equal(0, objectToken.Number.Generation);
Assert.Equal("WDKAAR+CMBX12", name.Data);
Assert.StartsWith("294 0 obj", s.Substring((int)objectToken.Position));
}
[Fact]
public void ReadsIndirectReferenceInObject()
{
const string s = @"
15 0 obj
12 7 R
endobj";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[0];
var reference = Assert.IsType<IndirectReferenceToken>(token.Data);
Assert.Equal(new IndirectReference(12, 7), reference.Data);
}
[Fact]
public void ReadsObjectWithUndefinedIndirectReference()
{
const string s = @"
5 0 obj
<<
/XObject <<
/Pic1 7 0 R
>>
/ProcSet [/PDF /Text /ImageC ]
/Font <<
/F0 8 0 R
/F1 9 0 R
/F2 10 0 R
/F3 0 0 R
>>
>>
endobj";
var scanner = GetScanner(s);
ReadToEnd(scanner);
var token = scanner.Get(new IndirectReference(5, 0));
Assert.NotNull(token);
token = scanner.Get(new IndirectReference(0, 0));
Assert.Null(token);
}
[Fact]
public void ReadsNumericObjectWithComment()
{
const string s = @"%PDF-1.2
% I commented here too, tee hee
10383384 2 obj
%and here, I just love comments
45
endobj
%%EOF";
var pdfScanner = GetScanner(s);
pdfScanner.MoveNext();
var obj = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
var num = Assert.IsType<NumericToken>(obj.Data);
Assert.Equal(45, num.Int);
Assert.Equal(10383384, obj.Number.ObjectNumber);
Assert.Equal(2, obj.Number.Generation);
Assert.StartsWith("10383384 2 obj", s.Substring((int)obj.Position));
Assert.False(pdfScanner.MoveNext());
}
[Fact]
public void ReadsArrayObject()
{
const string s = @"
endobj
295 0 obj
[
676 938 875 787 750 880 813 875 813 875 813 656 625 625 938 938 313
344 563 563 563 563 563 850 500 574 813 875 563 1019 1144 875 313
]
endobj";
var pdfScanner = GetScanner(s);
pdfScanner.MoveNext();
var obj = Assert.IsType<ObjectToken>(pdfScanner.CurrentToken);
var array = Assert.IsType<ArrayToken>(obj.Data);
Assert.Equal(676, ((NumericToken)array.Data[0]).Int);
Assert.Equal(33, array.Data.Count);
Assert.Equal(295, obj.Number.ObjectNumber);
Assert.Equal(0, obj.Number.Generation);
Assert.StartsWith("295 0 obj", s.Substring((int)obj.Position));
Assert.False(pdfScanner.MoveNext());
}
[Fact]
public void ReadsDictionaryObjectThenNameThenDictionary()
{
const string s = @"
274 0 obj
<<
/Type /Pages
/Count 2
/Parent 275 0 R
/Kids [ 121 0 R 125 0 R ]
>>
endobj
%Other parts...
310 0 obj
/WPXNWT+CMR9
endobj 311 0 obj
<<
/Type /Font
/Subtype /Type1
/FirstChar 0
/LastChar 127
/Widths 313 0 R
/BaseFont 310 0 R /FontDescriptor 312 0 R
>>
endobj";
var scanner = GetScanner(s);
var tokens = ReadToEnd(scanner);
var dictionary = Assert.IsType<DictionaryToken>(tokens[0].Data);
Assert.Equal(4, dictionary.Data.Count);
Assert.Equal(274, tokens[0].Number.ObjectNumber);
Assert.StartsWith("274 0 obj", s.Substring((int)tokens[0].Position));
var nameObject = Assert.IsType<NameToken>(tokens[1].Data);
Assert.Equal("WPXNWT+CMR9", nameObject.Data);
Assert.Equal(310, tokens[1].Number.ObjectNumber);
Assert.StartsWith("310 0 obj", s.Substring((int)tokens[1].Position));
dictionary = Assert.IsType<DictionaryToken>(tokens[2].Data);
Assert.Equal(7, dictionary.Data.Count);
Assert.Equal(311, tokens[2].Number.ObjectNumber);
Assert.StartsWith("311 0 obj", s.Substring((int)tokens[2].Position));
}
[Fact]
public void ReadsStringObject()
{
const string s = @"
58949797283757 0 obj (An object begins with obj and ends with endobj...) endobj
";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[0];
Assert.Equal(58949797283757L, token.Number.ObjectNumber);
Assert.Equal("An object begins with obj and ends with endobj...", Assert.IsType<StringToken>(token.Data).Data);
Assert.StartsWith("58949797283757 0 obj", s.Substring((int)token.Position));
}
[Fact]
public void ReadsStreamObject()
{
const string s = @"
352 0 obj
<< /S 1273 /Filter /FlateDecode /Length 353 0 R >>
stream
H‰œUkLSgþÚh¹IÝÅlK(%[ÈÅ©+ ƒåꩊèæÇtnZ)Z¹¨Oå~9ŠÊµo”[éiK)÷B¹´
É² ©¸˜ n±º×dKöcÏ÷ãœç{ßï}¾÷ÍÉs  Ô;€
À»—ÀF`ÇF@ƒ 4 ˜ï @¥T¨³fY: žµ;Îq®]cƒÿdp¨ÛI3F#G©#œ)TÇqW£NÚѬgOKbü‡µ#á¡£Þaîtƒƒß
¾“S>}µuÕõ5M±¢ª†»øÞû•q÷îÜ~¬PòžÞ~•¬ëɃGÅ-Ñ­ím·°gêêb,/,£P§õ^ ãÁô¿¿ŠTE]²±{šuwÔ`LG³DªìTÈ
A¡¬àð‰É©ˆ°¼×s³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾~´¼¬°À“Éððr¥8»P£ØêÁi½®Û(éhŽú;x#dÃÄ$m
+)
)†…±n
9ùyŽA·n\ï»t!=3£½¡:®­µåâ¹Ô³ø¼ËiûSÎsë;•Dt—ö$WÉ4U¢ºÚšñá1íÐèÔó‚svõ(/(+D²#mZÏ6êüÝ7x‡—†”‡E„²|ê«êªDµ5q°šR¦RÈ£ n¾[è~“}ýƒÝ½SꞦ'æQŽ
óF+Õ%ù‡ƒß9SˆŒÓãšH¶~L-#T]êîÁ©ÎkbjÒp½¸$¤´(4<,""øfvΕ< VЍ«#4'2l'Ð1ñðn?sìûãI'OŸøñçŸN5(äÊ'âÎѾÞþíðƒQmu}]Õ£‡c©.Œòµ9zz0ѲB¢«#š-3ªà<cš¥¡È¨qµ¦{pìÛ„Ã‡ŽŠ/íO»|áIclSCuo_Oœ\\ï!ª©«­ªƒTþ5әܔóî_9|ýÍ7ø!Ñý|2Goÿ€Î¶Öö…<ðáƒGéGá½G´Ã.®TŠóî=_|þ™‡ƒééFwßà 0æîc_Ó릳|ý|¶®æ„…†G8Òüï€l…\¦RFº:‰ VP𕐝S“Û¶ï V—ø/¿¾Xæ+«««ÖŽ4>ŸŸ¦Pà8®Ó…¼æ¢BaÅÐkëÊŠukÈÊÖL£­ivvv…k2=µZMØ|Úl(ŠZ­VÍbI>Ÿl¹œ(â±Äb­ø”Uª ñeü©U*“Oð,„E+¶Êà>ŽU”ÎÌõçlºFÃ_ÃÙl?¶=>>!>þC¿-×à©©©x¾€¢ŠÊåòtÃ0Æôz“‰ NÊ,¬‚kÀ°FXÛ4&“ÉfÃñÅæûæy=ÆãIðE _¾Èårår/XÞ„/·qòìÖ|†óx8Wð¹hºÜÂÕalÎü˜Ã0^Òòòü¼yÞ¶´´DX
ÇM8lüM…Oúý| 1Ïãk»:t<…ÂÚl¶e¾†” éKÜl6c¹¸É„ ”)‰'3¤œ\™ËN™ÿe^Ё² y÷ð¹f`3ëž´ ¸“$d:e†)!%2ºdvË@½N¼ªŠ Ùná¹ ¼¿@ €Ã.èšs ì÷ûM€2(E4_ | FÑ.@v@÷¤ÃÅ0È Pž~,€:»H¤ hT Œ € êÇV:Ô…©@@oH¯(3T‰{""C½SñŠœþtz3€•ƒ ñf.¬SЍøzWþ*$9gj=~Ì·QD E6o¥Ûi/Â`1ígGMq,;}޼sÔ×®kDü˜J{e5²ìɐ~Y)}fA>:˜ù–""Yò ç¹=ù²yÛ¡¿iØÏºþÇoäO ôkÆ)
endstream
endobj
353 0 obj
1479
endobj";
var locationProvider = new TestObjectLocationProvider();
// Mark location of "353 0 obj"
locationProvider.Offsets[new IndirectReference(353, 0)] = 1643;
var scanner = GetScanner(s, locationProvider);
var tokens = ReadToEnd(scanner);
Assert.Equal(2, tokens.Count);
var stream = Assert.IsType<StreamToken>(tokens[0].Data);
var str = Encoding.UTF8.GetString(stream.Data.ToArray());
Assert.StartsWith("H‰œUkLSgþÚh¹IÝÅl", str);
Assert.Equal(2, locationProvider.Offsets[new IndirectReference(352, 0)]);
}
[Fact]
public void ReadsSimpleStreamObject()
{
// Length of the bytes as found by Encoding.UTF8.GetBytes is 45
const string s = @"
574387 0 obj
<< /Length 45 >>
stream
À“Éððr¥8»P£ØêÁi½®Û(éhŽú
endstream
endobj";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[0];
var stream = Assert.IsType<StreamToken>(token.Data);
var bytes = stream.Data.ToArray();
Assert.Equal(45, bytes.Length);
var outputString = Encoding.UTF8.GetString(bytes);
Assert.Equal("À“Éððr¥8»P£ØêÁi½®Û(éhŽú", outputString);
}
[Fact]
public void ReadsStreamWithIndirectLength()
{
const string s = @"5 0 obj 52 endobj
12 0 obj
<< /Length 5 0 R /S 1245 >>
stream
%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾~´¼
endstream
endobj";
var locationProvider = new TestObjectLocationProvider();
locationProvider.Offsets[new IndirectReference(5, 0)] = 0;
var scanner = GetScanner(s, locationProvider);
var token = ReadToEnd(scanner)[1];
var stream = Assert.IsType<StreamToken>(token.Data);
var bytes = stream.Data.ToArray();
Assert.Equal(52, bytes.Length);
var outputString = Encoding.UTF8.GetString(bytes);
Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞ¾~´¼", outputString);
}
[Fact]
public void ReadsStreamWithMissingLength()
{
const string s = @"
12655 0 obj
<< /S 1245 >>
stream
%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼
endstream
endobj";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[0];
Assert.Equal(12655, token.Number.ObjectNumber);
var stream = Assert.IsType<StreamToken>(token.Data);
Assert.Equal("1245", stream.StreamDictionary.Data["S"].ToString());
Assert.Equal("%¥×³®í»š}%§X{{tøNåÝž¶ö¢ÖÞgrehtyyy$&%&£$££(*¾–~´¼", Encoding.UTF8.GetString(stream.Data.ToArray()));
}
[Fact]
public void ReadsStreamWithoutBreakBeforeEndstream()
{
const string s = @"
1 0 obj
12
endobj
7 0 obj
<< /Length 288
/Filter /FlateDecode >>
stream
xœ]ËjÃ0ÿÃ,ÓEð#NÒ€1¤N^ôA~€-]A- YYøï+Ï4¡t#qfîFWQY*­Dïv5:è”–§ñjB½Òa¤ •p7¤K  ƒÈûëyr8Tº!Ïà úð‚ÉÙVG9¶ø@Å7+Ñ*ÝÃ곬¹T_ùƵƒ8 Š$vË̗Ƽ6BDöu%½B¹yí$—Ù ¤\Hx71JœL#Ð6ºÇ0È㸀ü|. µüßõÏ""WÛ‰¯Æ.êÄ«ã8; ¤iL°!Ø %É`K°ßì¸ÃöÜáÜ)  [#CFðİ#(yƒg^ÿ¶æò
ÿž“¸Zë#¢?¢hP”Æû?šÑï÷ø¯‰Šendstream
endobj
9 0 obj
16
endobj";
var scanner = GetScanner(s);
var token = ReadToEnd(scanner)[1];
Assert.Equal(7, token.Number.ObjectNumber);
}
[Fact]
public void ReadsStringsWithMissingEndBracket()
{
const string input = @"5 0 obj
<<
/Kids [4 0 R 12 0 R 17 0 R 20 0 R 25 0 R 28 0 R ]
/Count 6
/Type /Pages
/MediaBox [ 0 0 612 792 ]
>>
endobj
1 0 obj
<<
/Creator (Corel WordPerfect - [D:\Wpdocs\WEBSITE\PROC&POL.WP6 (unmodified)
/CreationDate (D:19980224130723)
/Title (Proc&Pol.pdf)
/Author (J. L. Swezey)
/Producer (Acrobat PDFWriter 3.03 for Windows NT)
/Keywords (Budapest Treaty; Patent deposits; IDA)
/Subject (Patent Collection Procedures and Policies)
>>
endobj
3 0 obj
<<
/Pages 5 0 R
/Type /Catalog
>>
endobj";
var scanner = GetScanner(input);
var tokens = ReadToEnd(scanner);
Assert.Equal(3, tokens.Count);
var first = tokens[0];
Assert.Equal(5, first.Number.ObjectNumber);
var second = tokens[1];
Assert.Equal(1, second.Number.ObjectNumber);
var third = tokens[2];
Assert.Equal(3, third.Number.ObjectNumber);
}
[Fact]
public void ReadsDictionaryContainingNull()
{
const string input = @"14224 0 obj
<</Type /XRef
/Root 8 0 R
/Prev 116
/Length 84
/Size 35
/W [1 3 2]
/Index [0 1 6 1 8 2 25 10]
/ID [ (ù¸7<C2B8>ãA×<41>žòÜ4<C39C><34>Š•)]
/Info 6 0 R
/Encrypt null>>
endobj";
var scanner = GetScanner(input);
var tokens = ReadToEnd(scanner);
var dictionaryToken = tokens[0].Data as DictionaryToken;
Assert.NotNull(dictionaryToken);
var encryptValue = dictionaryToken.Data["Encrypt"];
Assert.IsType<NullToken>(encryptValue);
}
[Fact]
public void ReadMultipleNestedDictionary()
{
const string input =
@"
4 0 obj
<< /Type /Font /Subtype /Type1 /Name /AF1F040+Arial /BaseFont /Arial /FirstChar 32 /LastChar 255
/Encoding
<<
/Type /Encoding /BaseEncoding /WinAnsiEncoding
/Differences [128 /Euro 130 /quotesinglbase /florin /quotedblbase /ellipsis /dagger /daggerdbl /circumflex /perthousand /Scaron /guilsinglleft /OE 142 /Zcaron 145
/quoteleft /quoteright /quotedblleft /quotedblright /bullet /endash /emdash /tilde /trademark /scaron /guilsinglright /oe 158 /zcaron /Ydieresis /space /exclamdown
/cent /sterling /currency /yen /brokenbar /section /dieresis /copyright /ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus
/twosuperior /threesuperior /acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf /threequarters
/questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla /Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis
/Eth /Ntilde /Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex /Udieresis /Yacute /Thorn /germandbls /agrave /aacute
/acircumflex /atilde /adieresis /aring /ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis /eth /ntilde /ograve /oacute
/ocircumflex /otilde /odieresis /divide /oslash /ugrave /uacute /ucircumflex /udieresis /yacute /thorn /ydieresis ]
>>
/Widths [278 278 355 556 556 889 667 191 333 333 389 584 278 333 278 278
556 556 556 556 556 556 556 556 556 556 278 278 584 584 584 556
1015 667 667 722 722 667 611 778 722 278 500 667 556 833 722 778
667 778 722 667 611 722 667 944 667 667 611 278 278 278 469 556
333 556 556 500 556 556 278 556 556 222 222 500 222 833 556 556
556 556 333 500 278 556 500 722 500 500 500 334 260 334 584 750
556 750 222 556 333 1000 556 556 333 1000 667 333 1000 750 611 750
750 222 222 333 333 350 556 1000 333 1000 500 333 944 750 500 667
278 333 556 556 556 556 260 556 333 737 370 556 584 333 737 552
400 549 333 333 333 576 537 278 333 333 365 556 834 834 834 611
667 667 667 667 667 667 1000 722 667 667 667 667 278 278 278 278
722 722 778 778 778 778 778 584 778 722 722 722 722 667 667 611
556 556 556 556 556 556 889 500 556 556 556 556 278 278 278 278
556 556 556 556 556 556 556 549 611 556 556 556 556 500 556 500
]
>>
>>
endobj
";
var scanner = GetScanner(input);
var tokens = ReadToEnd(scanner);
var dictionaryToken = tokens[0].Data as DictionaryToken;
Assert.NotNull(dictionaryToken);
}
private static PdfTokenScanner GetScanner(string s, TestObjectLocationProvider locationProvider = null)
{
var input = StringBytesTestConverter.Convert(s, false);
return new PdfTokenScanner(input.Bytes, locationProvider ?? new TestObjectLocationProvider(),
new TestFilterProvider(), NoOpEncryptionHandler.Instance);
}
private static IReadOnlyList<ObjectToken> ReadToEnd(PdfTokenScanner scanner)
{
var result = new List<ObjectToken>();
while (scanner.MoveNext())
{
if (scanner.CurrentToken is ObjectToken obj)
{
result.Add(obj);
}
else
{
throw new InvalidOperationException($"Pdf token scanner produced token which was not an object token: {scanner.CurrentToken}.");
}
}
return result;
}
}
}