PdfPig/src/UglyToad.PdfPig/Writer/TokenWriter.cs
2024-03-15 13:10:25 +00:00

686 lines
25 KiB
C#

namespace UglyToad.PdfPig.Writer
{
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.IO;
using System.Linq;
using Core;
using Graphics.Operations;
using Tokens;
using Util;
/// <summary>
/// Writes any type of <see cref="IToken"/> to the corresponding PDF document format output.
/// </summary>
public class TokenWriter : ITokenWriter
{
private static readonly byte ArrayStart = GetByte("[");
private static readonly byte ArrayEnd = GetByte("]");
private static readonly byte[] DictionaryStart = OtherEncodings.StringAsLatin1Bytes("<<");
private static readonly byte[] DictionaryEnd = OtherEncodings.StringAsLatin1Bytes(">>");
private static readonly byte Comment = GetByte("%");
private static readonly byte[] Eof = OtherEncodings.StringAsLatin1Bytes("%%EOF");
private static readonly byte[] FalseBytes = OtherEncodings.StringAsLatin1Bytes("false");
private static readonly byte HexStart = GetByte("<");
private static readonly byte HexEnd = GetByte(">");
private static readonly byte InUseEntry = GetByte("n");
private static readonly byte NameStart = GetByte("/");
private static readonly byte[] Null = OtherEncodings.StringAsLatin1Bytes("null");
private static readonly byte[] ObjStart = OtherEncodings.StringAsLatin1Bytes("obj");
private static readonly byte[] ObjEnd = OtherEncodings.StringAsLatin1Bytes("endobj");
private static readonly byte RByte = GetByte("R");
private static readonly byte[] StartXref = OtherEncodings.StringAsLatin1Bytes("startxref");
/// <summary>
/// Bytes that indicate start of stream
/// </summary>
protected static readonly byte[] StreamStart = OtherEncodings.StringAsLatin1Bytes("stream");
/// <summary>
/// Bytes that indicate end start of stream
/// </summary>
protected static readonly byte[] StreamEnd = OtherEncodings.StringAsLatin1Bytes("endstream");
private static readonly byte StringStart = GetByte("(");
private static readonly byte StringEnd = GetByte(")");
private static readonly byte[] Trailer = OtherEncodings.StringAsLatin1Bytes("trailer");
private static readonly byte[] TrueBytes = OtherEncodings.StringAsLatin1Bytes("true");
private static readonly byte Whitespace = GetByte(" ");
private static readonly byte[] Xref = OtherEncodings.StringAsLatin1Bytes("xref");
private static readonly HashSet<char> DelimiterChars = new HashSet<char>
{
'(',
')',
'<',
'>',
'[',
']',
'{',
'}',
'/',
'%'
};
/// <summary>
/// Single global instance
/// </summary>
public static TokenWriter Instance { get; } = new TokenWriter();
/// <summary>
/// Writes the given input token to the output stream with the correct PDF format and encoding including whitespace and line breaks as applicable.
/// </summary>
/// <param name="token">The token to write to the stream.</param>
/// <param name="outputStream">The stream to write the token to.</param>
public void WriteToken(IToken token, Stream outputStream)
{
if (token == null)
{
WriteNullToken(outputStream);
return;
}
switch (token)
{
case ArrayToken array:
WriteArray(array, outputStream);
break;
case BooleanToken boolean:
WriteBoolean(boolean, outputStream);
break;
case CommentToken comment:
WriteComment(comment, outputStream);
break;
case DictionaryToken dictionary:
WriteDictionary(dictionary, outputStream);
break;
case HexToken hex:
WriteHex(hex, outputStream);
break;
case IndirectReferenceToken reference:
WriteIndirectReference(reference, outputStream);
break;
case NameToken name:
WriteName(name, outputStream);
break;
case NullToken _:
outputStream.Write(Null, 0, Null.Length);
WriteWhitespace(outputStream);
break;
case NumericToken number:
WriteNumber(number, outputStream);
break;
case ObjectToken objectToken:
WriteObject(objectToken, outputStream);
break;
case StreamToken streamToken:
WriteStream(streamToken, outputStream);
break;
case StringToken stringToken:
WriteString(stringToken, outputStream);
break;
default:
throw new PdfDocumentFormatException($"Attempted to write token type of {token.GetType()} but was not known.");
}
}
/// <inheritdoc cref="ITokenWriter.WriteCrossReferenceTable" />
public void WriteCrossReferenceTable(IReadOnlyDictionary<IndirectReference, long> objectOffsets,
IndirectReference catalogToken,
Stream outputStream,
IndirectReference? documentInformationReference)
{
if (objectOffsets.Count == 0)
{
throw new InvalidOperationException("Could not write empty cross reference table.");
}
WriteLineBreak(outputStream);
var position = outputStream.Position;
outputStream.Write(Xref, 0, Xref.Length);
WriteLineBreak(outputStream);
var sets = new List<XrefSeries>();
var orderedList = objectOffsets.OrderBy(x => x.Key.ObjectNumber).ToList();
long firstObjectNumber = 0;
long currentObjNum = 0;
var items = new List<XrefSeries.OffsetAndGeneration>
{
// Zero entry
null
};
foreach (var item in orderedList)
{
var step = item.Key.ObjectNumber - currentObjNum;
if (step == 1)
{
currentObjNum = item.Key.ObjectNumber;
items.Add(new XrefSeries.OffsetAndGeneration(item.Value, item.Key.Generation));
}
else
{
sets.Add(new XrefSeries(firstObjectNumber, items));
items = new List<XrefSeries.OffsetAndGeneration>
{
new XrefSeries.OffsetAndGeneration(item.Value, item.Key.Generation)
};
currentObjNum = item.Key.ObjectNumber;
firstObjectNumber = item.Key.ObjectNumber;
}
}
if (items.Count > 0)
{
sets.Add(new XrefSeries(firstObjectNumber, items));
}
foreach (var series in sets)
{
WriteLong(series.First, outputStream);
WriteWhitespace(outputStream);
WriteLong(series.Offsets.Count, outputStream);
WriteWhitespace(outputStream);
WriteLineBreak(outputStream);
foreach (var offset in series.Offsets)
{
if (offset != null)
{
/*
* nnnnnnnnnn ggggg n eol
* where:
* nnnnnnnnnn is a 10-digit byte offset
* ggggg is a 5-digit generation number
* n is a literal keyword identifying this as an in-use entry
* eol is a 2-character end-of-line sequence ('\r\n' or ' \n')
*/
var paddedOffset = OtherEncodings.StringAsLatin1Bytes(offset.Offset.ToString("D10", CultureInfo.InvariantCulture));
outputStream.Write(paddedOffset, 0, paddedOffset.Length);
WriteWhitespace(outputStream);
var generation = OtherEncodings.StringAsLatin1Bytes(offset.Generation.ToString("D5", CultureInfo.InvariantCulture));
outputStream.Write(generation, 0, generation.Length);
WriteWhitespace(outputStream);
outputStream.WriteByte(InUseEntry);
WriteWhitespace(outputStream);
WriteLineBreak(outputStream);
}
else
{
WriteFirstXrefEmptyEntry(outputStream);
}
}
}
outputStream.Write(Trailer, 0, Trailer.Length);
WriteLineBreak(outputStream);
var identifier = new ArrayToken(new IToken[]
{
new HexToken(Guid.NewGuid().ToString("N").ToCharArray()),
new HexToken(Guid.NewGuid().ToString("N").ToCharArray())
});
var trailerDictionaryData = new Dictionary<NameToken, IToken>
{
// 1 for the free entry.
{NameToken.Size, new NumericToken(objectOffsets.Count + 1)},
{NameToken.Root, new IndirectReferenceToken(catalogToken)},
{NameToken.Id, identifier}
};
if (documentInformationReference.HasValue)
{
trailerDictionaryData[NameToken.Info] = new IndirectReferenceToken(documentInformationReference.Value);
}
var trailerDictionary = new DictionaryToken(trailerDictionaryData);
WriteDictionary(trailerDictionary, outputStream);
WriteLineBreak(outputStream);
outputStream.Write(StartXref, 0, StartXref.Length);
WriteLineBreak(outputStream);
WriteLong(position, outputStream);
WriteLineBreak(outputStream);
// Complete!
outputStream.Write(Eof, 0, Eof.Length);
}
/// <summary>
/// Indicates that we are writing page contents.
/// Can be used by a derived class.
/// </summary>
public bool WritingPageContents { get; set; }
/// <inheritdoc cref="ITokenWriter.WriteObject" />
public void WriteObject(long objectNumber, int generation, byte[] data, Stream outputStream)
{
WriteLong(objectNumber, outputStream);
WriteWhitespace(outputStream);
WriteInt(generation, outputStream);
WriteWhitespace(outputStream);
outputStream.Write(ObjStart, 0, ObjStart.Length);
WriteLineBreak(outputStream);
outputStream.Write(data, 0, data.Length);
WriteLineBreak(outputStream);
outputStream.Write(ObjEnd, 0, ObjEnd.Length);
WriteLineBreak(outputStream);
}
/// <summary>
/// Write a hex value to the output stream
/// </summary>
protected void WriteHex(HexToken hex, Stream stream)
{
stream.WriteByte(HexStart);
stream.WriteText(hex.GetHexString());
stream.WriteByte(HexEnd);
}
/// <summary>
/// Write an array to the output stream, with whitespace at the end.
/// </summary>
protected void WriteArray(ArrayToken array, Stream outputStream)
{
outputStream.WriteByte(ArrayStart);
WriteWhitespace(outputStream);
for (var i = 0; i < array.Data.Count; i++)
{
var value = array.Data[i];
WriteToken(value, outputStream);
}
outputStream.WriteByte(ArrayEnd);
WriteWhitespace(outputStream);
}
/// <summary>
/// Write a boolean "true" or "false" to the output stream, with whitespace at the end.
/// </summary>
protected void WriteBoolean(BooleanToken boolean, Stream outputStream)
{
var bytes = boolean.Data ? TrueBytes : FalseBytes;
outputStream.Write(bytes, 0, bytes.Length);
WriteWhitespace(outputStream);
}
/// <summary>
/// Write a "%comment" in the output stream, with a line break at the end.
/// </summary>
protected void WriteComment(CommentToken comment, Stream outputStream)
{
var bytes = OtherEncodings.StringAsLatin1Bytes(comment.Data);
outputStream.WriteByte(Comment);
outputStream.Write(bytes, 0, bytes.Length);
WriteLineBreak(outputStream);
}
/// <summary>
/// Write "null" in the output stream with a whitespace at the end.
/// </summary>
protected void WriteNullToken(Stream outputStream)
{
var bytes = OtherEncodings.StringAsLatin1Bytes("null");
outputStream.Write(bytes, 0, bytes.Length);
WriteWhitespace(outputStream);
}
/// <summary>
/// Writes dictionary key/value pairs to output stream as Name/Token pairs.
/// </summary>
/// <param name="dictionary"></param>
/// <param name="outputStream"></param>
protected void WriteDictionary(DictionaryToken dictionary, Stream outputStream)
{
outputStream.Write(DictionaryStart, 0, DictionaryStart.Length);
foreach (var pair in dictionary.Data)
{
WriteName(pair.Key, outputStream);
// handle scenario where PdfPig has a null value under some circumstances
if (pair.Value == null)
{
WriteToken(NullToken.Instance, outputStream);
}
else
{
WriteToken(pair.Value, outputStream);
}
}
outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length);
}
/// <summary>
/// Write an indirect reference to the stream, with whitespace at the end.
/// </summary>
/// <param name="reference"></param>
/// <param name="outputStream"></param>
protected virtual void WriteIndirectReference(IndirectReferenceToken reference, Stream outputStream)
{
WriteLong(reference.Data.ObjectNumber, outputStream);
WriteWhitespace(outputStream);
WriteInt(reference.Data.Generation, outputStream);
WriteWhitespace(outputStream);
outputStream.WriteByte(RByte);
WriteWhitespace(outputStream);
}
/// <summary>
/// Write a name to the stream, with whitespace at the end.
/// </summary>
/// <param name="name"></param>
/// <param name="outputStream"></param>
protected virtual void WriteName(NameToken name, Stream outputStream)
{
WriteName(name.Data, outputStream);
}
private void WriteName(string name, Stream outputStream)
{
/*
* Beginning with PDF 1.2, any character except null (character code 0) may be
* included in a name by writing its 2-digit hexadecimal code, preceded by the number sign character (#).
* This is required for delimiter and whitespace characters.
* This is recommended for characters whose codes are outside the range 33 (!) to 126 (~).
*/
var sb = new StringBuilder();
foreach (var c in name)
{
if (c < 33 || c > 126 || DelimiterChars.Contains(c))
{
var str = Hex.GetString([(byte)c]);
sb.Append('#').Append(str);
}
else
{
sb.Append(c);
}
}
var bytes = OtherEncodings.StringAsLatin1Bytes(sb.ToString());
outputStream.WriteByte(NameStart);
outputStream.Write(bytes, 0, bytes.Length);
WriteWhitespace(outputStream);
}
/// <summary>
/// Write a number to the stream, with whitespace at the end.
/// </summary>
/// <param name="number"></param>
/// <param name="outputStream"></param>
protected virtual void WriteNumber(NumericToken number, Stream outputStream)
{
if (!number.HasDecimalPlaces)
{
WriteInt(number.Int, outputStream);
}
else
{
var bytes = OtherEncodings.StringAsLatin1Bytes(number.Data.ToString("G", CultureInfo.InvariantCulture));
outputStream.Write(bytes, 0, bytes.Length);
}
WriteWhitespace(outputStream);
}
/// <summary>
/// Write an object to the stream, with a line break at the end. It writes the following contents:
/// - "[ObjectNumber] [Generation] obj"
/// - Object data
/// - "endobj"
/// </summary>
/// <param name="objectToken"></param>
/// <param name="outputStream"></param>
protected virtual void WriteObject(ObjectToken objectToken, Stream outputStream)
{
WriteLong(objectToken.Number.ObjectNumber, outputStream);
WriteWhitespace(outputStream);
WriteInt(objectToken.Number.Generation, outputStream);
WriteWhitespace(outputStream);
outputStream.Write(ObjStart, 0, ObjStart.Length);
WriteLineBreak(outputStream);
WriteToken(objectToken.Data, outputStream);
WriteLineBreak(outputStream);
outputStream.Write(ObjEnd, 0, ObjEnd.Length);
WriteLineBreak(outputStream);
}
/// <summary>
/// Write a stream token to the output stream, with the following contents:
/// - Dictionary specifying the length of the stream, any applied compression filters and additional information.
/// - Stream start indicator
/// - Bytes in the StreamToken data
/// - Stream end indicator
/// </summary>
/// <param name="streamToken"></param>
/// <param name="outputStream"></param>
protected virtual void WriteStream(StreamToken streamToken, Stream outputStream)
{
WriteDictionary(streamToken.StreamDictionary, outputStream);
WriteLineBreak(outputStream);
outputStream.Write(StreamStart, 0, StreamStart.Length);
WriteLineBreak(outputStream);
outputStream.Write(streamToken.Data.ToArray(), 0, streamToken.Data.Count);
WriteLineBreak(outputStream);
outputStream.Write(StreamEnd, 0, StreamEnd.Length);
}
private static readonly int[] EscapeNeeded = new int[]
{
'\r', '\n', '\t', '\b', '\f', '\\'
};
private static readonly int[] Escaped = new int[]
{
'r', 'n', 't', 'b', 'f', '\\'
};
/// <summary>
/// Write string to the stream, with whitespace at the end
/// </summary>
protected virtual void WriteString(StringToken stringToken, Stream outputStream)
{
outputStream.WriteByte(StringStart);
if (stringToken.EncodedWith == StringToken.Encoding.Iso88591 || stringToken.EncodedWith == StringToken.Encoding.PdfDocEncoding)
{
// iso 88591 (or really PdfDocEncoding in non-contentstream circumstances shouldn't
// have these chars but seems like internally this isn't obeyed (see:
// CanCreateDocumentInformationDictionaryWithNonAsciiCharacters test) and it may
// happen during parsing as well -> switch to unicode
var data = stringToken.Data.ToCharArray();
if (data.Any(x => x > 255))
{
data = new StringToken(stringToken.Data, StringToken.Encoding.Utf16BE)
.GetBytes()
.Select(b => (char)b)
.ToArray();
}
int ei;
for (var i = 0; i < data.Length; i++)
{
var c = (int)data[i];
if (c == (int)'(' || c == (int)')') // wastes a little space if escaping not needed but better than forward searching
{
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)c);
}
else if ((ei = Array.IndexOf(EscapeNeeded, c)) > -1)
{
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)Escaped[ei]);
}
else if (c < 32 || c > 126) // non printable
{
var b3 = c / 64;
var b2 = (c - b3 * 64) / 8;
var b1 = c % 8;
outputStream.WriteByte((byte)'\\');
outputStream.WriteByte((byte)(b3 + '0'));
outputStream.WriteByte((byte)(b2 + '0'));
outputStream.WriteByte((byte)(b1 + '0'));
}
else
{
outputStream.WriteByte((byte)c);
}
}
}
else
{
var bytes = stringToken.GetBytes();
outputStream.Write(bytes, 0, bytes.Length);
}
outputStream.WriteByte(StringEnd);
WriteWhitespace(outputStream);
}
/// <summary>
/// Write an integer to the stream
/// </summary>
/// <param name="value"></param>
/// <param name="outputStream"></param>
protected virtual void WriteInt(int value, Stream outputStream)
{
var bytes = OtherEncodings.StringAsLatin1Bytes(value.ToString("G", CultureInfo.InvariantCulture));
outputStream.Write(bytes, 0, bytes.Length);
}
/// <summary>
/// Write a line break to the output stream
/// </summary>
/// <param name="outputStream"></param>
protected virtual void WriteLineBreak(Stream outputStream)
{
outputStream.WriteNewLine();
}
/// <summary>
/// Write a long to the stream
/// </summary>
/// <param name="value"></param>
/// <param name="outputStream"></param>
protected virtual void WriteLong(long value, Stream outputStream)
{
var bytes = OtherEncodings.StringAsLatin1Bytes(value.ToString("G", CultureInfo.InvariantCulture));
outputStream.Write(bytes, 0, bytes.Length);
}
/// <summary>
/// Write a space to the output stream
/// </summary>
/// <param name="outputStream"></param>
protected virtual void WriteWhitespace(Stream outputStream)
{
outputStream.WriteByte(Whitespace);
}
private void WriteFirstXrefEmptyEntry(Stream outputStream)
{
/*
* The first entry in the table (object number 0) is always free and has a generation number of 65,535;
* it is the head of the linked list of free objects.
*/
outputStream.WriteText(new string('0', 10));
outputStream.WriteWhiteSpace();
outputStream.WriteText("65535");
outputStream.WriteWhiteSpace();
outputStream.WriteText("f");
outputStream.WriteWhiteSpace();
outputStream.WriteNewLine();
}
private static byte GetByte(string value)
{
var bytes = OtherEncodings.StringAsLatin1Bytes(value);
if (bytes.Length > 1)
{
throw new InvalidOperationException();
}
return bytes[0];
}
private class XrefSeries
{
public long First { get; }
public IReadOnlyList<OffsetAndGeneration> Offsets { get; }
public XrefSeries(long first, IReadOnlyList<OffsetAndGeneration> offsets)
{
First = first;
Offsets = offsets;
}
public class OffsetAndGeneration
{
public long Offset { get; }
public long Generation { get; }
public OffsetAndGeneration(long offset, long generation)
{
Offset = offset;
Generation = generation;
}
}
}
}
}