PdfPig/src/UglyToad.PdfPig.Tests/Integration/PdfParserTests.cs

173 lines
6.4 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

namespace UglyToad.PdfPig.Tests.Integration
{
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Text;
using PdfPig.Filters;
using PdfPig.Tokens;
using Xunit;
/*
* catalog
* the primary dictionary object containing references directly or indirectly to all other objects in the document with
* the exception that there may be objects in the trailer that are not referred to by the catalog
*
* General Format
* • A one-line header identifying the version of the PDF specification to which the file conforms
* • A body containing the objects that make up the document contained in the file
* • A cross-reference table containing information about the indirect objects in the file
* • A trailer giving the location of the cross-reference table and of certain special objects within the body of the file
*
* Each line shall be terminated by an end-of-line (EOL) marker, which may be a CARRIAGE RETURN (0Dh), a
* LINE FEED (0Ah), or both. PDF files with binary data may have arbitrarily long lines.
*
* The first line of a PDF file shall be a header consisting of the 5 characters %PDF- followed by a version
* number of the form 1.N, where N is a digit between 0 and 7.
* A conforming reader shall accept files with any of the following headers:
* %PDF-1.0
* %PDF-1.1
* %PDF-1.2
* %PDF-1.3
* %PDF-1.4
* %PDF-1.5
* %PDF-1.6
* %PDF-1.7
* Beginning with PDF 1.4, the Version entry in the document's catalog dictionary (located via the Root entry in
* the file's trailer, as described in 7.5.5, "File Trailer"), if present, shall be used instead of the version
* specified in the Header.
*/
public class PdfParserTests
{
/// <summary>
/// Locates the stream that follows the "7 0 obj" marker in the sample document, inflates it
/// with <see cref="DeflateStream"/> and checks that decompression produced data.
/// </summary>
[Fact]
public void CanDecompressNormalObjectStream()
{
    var bytes = File.ReadAllBytes(GetNthFilename());

    // The 7th object is encoded with no flate parameters.
    var obj7 = GetOffset(bytes, "7 0 obj");
    var streamPosition = GetOffset(bytes, "stream", obj7.end);
    var endStreamPosition = GetOffset(bytes, "endstream", streamPosition.end);

    // Skip one byte after the "stream" keyword and stop before the byte preceding "endstream"
    // (presumably the end-of-line markers around the stream data).
    var streamBytes = BytesBetween(streamPosition.end + 1, endStreamPosition.start - 1, bytes);

    var decodedBytes = new List<int>();
    using (var memo = new MemoryStream(streamBytes))
    {
        // Skip the first 2 header bytes, C#'s DeflateStream can't deal with them.
        memo.ReadByte();
        memo.ReadByte();

        using (var str = new DeflateStream(memo, CompressionMode.Decompress))
        {
            // ReadByte returns -1 once the decompressed data is exhausted.
            var x = str.ReadByte();
            while (x != -1)
            {
                decodedBytes.Add(x);
                x = str.ReadByte();
            }
        }
    }

    // Without an assertion the test could only fail by throwing; make sure we actually decoded something.
    Assert.NotEmpty(decodedBytes);
}
/// <summary>
/// Decodes the first stream in the sample document through <see cref="FlateFilter"/> using
/// Predictor 12 / Columns 4 decode parameters and compares the result with the known byte values.
/// </summary>
[Fact]
public void CanDecompressPngEncodedFlateStream()
{
    // The Xref stream is encoded with parameters.
    // For flate as per http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
    var bytes = File.ReadAllBytes(GetNthFilename());

    var streamPosition = GetOffset(bytes, "stream");
    var endStreamPosition = GetOffset(bytes, "endstream", streamPosition.end);
    var streamBytes = BytesBetween(streamPosition.end + 1, endStreamPosition.start - 1, bytes);

    var paramsDict = new DictionaryToken(new Dictionary<NameToken, IToken>
    {
        { NameToken.Predictor, new NumericToken(12) },
        { NameToken.Columns, new NumericToken(4) }
    });

    var dictionary = new DictionaryToken(new Dictionary<NameToken, IToken>
    {
        { NameToken.Filter, NameToken.FlateDecode },
        { NameToken.DecodeParms, paramsDict }
    });

    var filter = new FlateFilter(new DecodeParameterResolver(null), new PngPredictor(), null);
    var filtered = filter.Decode(streamBytes, dictionary, 0);

    var expected =
        "1 0 15 0 1 0 216 0 1 2 160 0 1 2 210 0 1 3 84 0 1 4 46 0 1 7 165 0 1 70 229 0 1 72 84 0 1 96 235 0 1 98 18 0 2 0 12 0 2 0 12 1 2 0 12 2 2 0 12 3 2 0 12 4 2 0 12 5 2 0 12 6 2 0 12 7 2 0 12 8"
            .Split(' ')
            .Select(byte.Parse).ToArray();

    // xUnit expects (expected, actual); the previous order swapped the labels in failure output.
    Assert.Equal(expected, filtered);
}
/// <summary>
/// Gets the start and end offset of the first occurrence of a search term.
/// </summary>
/// <param name="input">The bytes to search.</param>
/// <param name="term">The text to look for; it is matched against <paramref name="input"/> as UTF-8 bytes.</param>
/// <param name="offset">The index in <paramref name="input"/> at which the search begins.</param>
/// <returns>
/// The index of the first matching byte and the index immediately after the last matching byte.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="term"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="term"/> is empty.</exception>
/// <exception cref="InvalidOperationException">
/// The term does not occur at or after <paramref name="offset"/>.
/// </exception>
private static (int start, int end) GetOffset(IReadOnlyList<byte> input, string term, int offset = 0)
{
    if (term == null)
    {
        throw new ArgumentNullException(nameof(term));
    }

    if (term.Length == 0)
    {
        throw new ArgumentException("The search term must not be empty.", nameof(term));
    }

    var termBytes = Encoding.UTF8.GetBytes(term);

    // Only consider positions where the whole term still fits; this keeps the inner
    // comparison from indexing past the end of the input on a partial match at the tail.
    var lastCandidate = input.Count - termBytes.Length;

    for (var i = offset; i <= lastCandidate; i++)
    {
        var found = true;
        for (var j = 0; j < termBytes.Length; j++)
        {
            if (input[i + j] != termBytes[j])
            {
                found = false;
                break;
            }
        }

        if (found)
        {
            // Use the encoded byte count: term.Length counts chars, which differs for non-ASCII text.
            return (i, i + termBytes.Length);
        }
    }

    throw new InvalidOperationException($"Could not find '{term}' in the bytes at or after offset {offset}.");
}
/// <summary>
/// Copies the bytes at indices <paramref name="start"/> up to, but not including,
/// <paramref name="endExclusive"/> into a new array.
/// </summary>
/// <param name="start">Index of the first byte to copy.</param>
/// <param name="endExclusive">Index at which copying stops; the byte at this index is not copied.</param>
/// <param name="bytes">The source bytes.</param>
/// <returns>A new array holding the copied bytes; empty when the range contains no indices.</returns>
private static byte[] BytesBetween(int start, int endExclusive, byte[] bytes)
{
    var collected = new List<byte>();

    var position = start;
    while (position < endExclusive)
    {
        collected.Add(bytes[position]);
        position++;
    }

    return collected.ToArray();
}
/// <summary>
/// Builds the full path of the sample PDF used by these tests.
/// </summary>
/// <returns>
/// The path of "Single Page Simple - from google drive.pdf" inside the Integration/Documents folder,
/// resolved three directory levels above the current app domain's base directory.
/// </returns>
private static string GetNthFilename()
{
    const string fileName = "Single Page Simple - from google drive.pdf";

    var relativeFolder = Path.Combine(
        AppDomain.CurrentDomain.BaseDirectory,
        "..",
        "..",
        "..",
        "Integration",
        "Documents");

    return Path.Combine(Path.GetFullPath(relativeFolder), fileName);
}
}
}