diff --git a/src/UglyToad.Pdf.Tests/Filters/Ascii85FilterTests.cs b/src/UglyToad.Pdf.Tests/Filters/Ascii85FilterTests.cs
new file mode 100644
index 00000000..898f5bc5
--- /dev/null
+++ b/src/UglyToad.Pdf.Tests/Filters/Ascii85FilterTests.cs
@@ -0,0 +1,100 @@
+namespace UglyToad.Pdf.Tests.Filters
+{
+    using System.Text;
+    using ContentStream;
+    using Pdf.Filters;
+    using Xunit;
+
+    public class Ascii85FilterTests
+    {
+        private readonly Ascii85Filter filter = new Ascii85Filter();
+
+        [Fact]
+        public void DecodesWikipediaExample()
+        {
+            // NOTE(review): this encoded text looks shorter than the canonical Wikipedia Ascii85 sample
+            // (spans starting with '<' seem to be missing) - verify it against the original before trusting this test.
+            var bytes = Encoding.ASCII.GetBytes(
+                @"9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKFCj@.4Gp$d7F!,L7@<6@)/0JDEF@3BB / F * &OCAfu2 / AKY
+            i(DIb: @FD, *) + C]U =@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF-FD5W8ARlolDIa
+            l(DId uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>");
+
+            var result = filter.Decode(bytes, new PdfDictionary(), 0);
+
+            var text = Encoding.ASCII.GetString(result);
+
+            Assert.Equal("Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, " +
+                         "that by a perseverance of delight in the continued and indefatigable generation of knowledge, " +
+                         "exceeds the short vehemence of any carnal pleasure.",
+                text);
+        }
+
+        [Fact]
+        public void DecodesEncodedPdfContent()
+        {
+            const string input =
+                @"0d&.mDdmGg4?O`>9P&*SFD)dS2E2gC4pl@QEb/Zr$8N_r$:7]!01IZ=0eskNAdU47<+?7h+B3Ol2_m!C+?)#1+B1
+`9>:0H_br.:""&q8d[6p/M
+T()<(%'A;f?Ma+CT;%+E_a:A0>K&EZek1D/aN,F)u&6DBNA*A0>f4BOu4*+EM76E,9eK+B3(_<%9""p.!0AMEb031ATMF#Fp**=$,E,oN2F(oQ1+D#G#De*R""B-;&&FD,T'F!+n3AKY4b
+F*22=@:F%a+=SF4C'moi+=Li?EZeh0FD)e-@<>p#@;]TuBl.9kATKCFGA(],AKYo5BOu4*+CT;%+C#7pF_Pr+@VfTuDf0B:+=SF4C'moi+=
+Li?EZek1DKKT1F`2DD/TboKAKY](@:s.m/h%oBC'mC/$>""*cF*)G6@;Q?_DIdZpC&~>";
+
+            var result = filter.Decode(Encoding.ASCII.GetBytes(input), new PdfDictionary(), 0);
+
+            var text = Encoding.ASCII.GetString(result);
+
+            const string expected = @"1 0 obj
+<< /Length 568 >>
+stream
+2 J
+BT
+/F1 12 Tf
+0 Tc
+0 Tw
+72.5 712 TD
+[(Unencoded streams can be read easily) 65 (, )] TJ
+0 -14 TD
+[(b) 20 (ut generally tak) 10 (e more space than \311)] TJ
+T* (encoded streams.) Tj
+0 -28 TD
+[(Se) 25 (v) 15 (eral encoding methods are a) 20 (v) 25 (ailable in PDF) 80 (.)] TJ
+0 -14 TD
+(Some are used for compression and others simply) Tj
+T* [(to represent binary data in an ) 55 (ASCII format.)] TJ
+T* (Some of the compression encoding methods are \
+suitable ) Tj
+T* (for both data and images, while others are \
+suitable only ) Tj
+T* (for continuous-tone images.) Tj
+ET
+endstream
+endobj";
+
+            Assert.Equal(expected.Replace("\r\n", "\n"), text);
+        }
+
+        [Fact]
+        public void DecodesZeroBlockAsFourZeroBytes()
+        {
+            var result = filter.Decode(Encoding.ASCII.GetBytes("z~>"), new PdfDictionary(), 0);
+
+            Assert.Equal(new byte[4], result);
+        }
+
+        [Fact]
+        public void DecodesFinalPartialGroupWithoutEndOfDataMarker()
+        {
+            var result = filter.Decode(Encoding.ASCII.GetBytes("87cURDZ"), new PdfDictionary(), 0);
+
+            Assert.Equal("Hello", Encoding.ASCII.GetString(result));
+        }
+
+        [Fact]
+        public void IgnoresTabsAndFormFeeds()
+        {
+            var result = filter.Decode(Encoding.ASCII.GetBytes("87cUR\tD\fZ~>"), new PdfDictionary(), 0);
+
+            Assert.Equal("Hello", Encoding.ASCII.GetString(result));
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/Filters/Ascii85Filter.cs b/src/UglyToad.Pdf/Filters/Ascii85Filter.cs
new file mode 100644
index 00000000..ca441494
--- /dev/null
+++ b/src/UglyToad.Pdf/Filters/Ascii85Filter.cs
@@ -0,0 +1,170 @@
+namespace UglyToad.Pdf.Filters
+{
+    using System;
+    using System.IO;
+    using ContentStream;
+
+    /// <summary>
+    /// ASCII 85 (Base85) is a binary to text encoding using 5 ASCII characters per 4 bytes of data.
+    /// </summary>
+    public class Ascii85Filter : IFilter
+    {
+        // 'z' is shorthand for a group of four zero bytes.
+        private const byte EmptyBlock = (byte)'z';
+        // '!' encodes the base-85 digit 0; subtracting it turns a character into its digit value.
+        private const byte Offset = (byte)'!';
+        // 'u' encodes the highest base-85 digit (84) and is used to pad a short final group.
+        private const byte EmptyCharacterPadding = (byte) 'u';
+
+        private static readonly byte[] EndOfDataBytes = { (byte)'~', (byte)'>' };
+
+        // PowerByIndex[n] is 85 raised to the power n.
+        private static readonly int[] PowerByIndex = {
+            1,
+            85,
+            85 * 85,
+            85 * 85 * 85,
+            85 * 85 * 85 *85
+        };
+
+        /// <summary>
+        /// Decodes ASCII 85 encoded data into the bytes it represents.
+        /// </summary>
+        /// <param name="input">The encoded bytes; decoding stops at the '~&gt;' end-of-data marker or at the end of the input.</param>
+        /// <param name="streamDictionary">Not used by this filter.</param>
+        /// <param name="filterIndex">Not used by this filter.</param>
+        /// <returns>The decoded bytes.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">The end-of-data marker follows a final group of a single character.</exception>
+        public byte[] Decode(byte[] input, PdfDictionary streamDictionary, int filterIndex)
+        {
+            if (input == null)
+            {
+                throw new ArgumentNullException(nameof(input));
+            }
+
+            var asciiBuffer = new byte[5];
+
+            var index = 0;
+
+            using (var stream = new MemoryStream())
+            using (var writer = new BinaryWriter(stream))
+            {
+                for (var i = 0; i < input.Length; i++)
+                {
+                    var value = input[i];
+
+                    if (IsWhiteSpace(value))
+                    {
+                        continue;
+                    }
+
+                    if (value == EndOfDataBytes[0])
+                    {
+                        if (i == input.Length - 1 || input[i + 1] == EndOfDataBytes[1])
+                        {
+                            if (index > 0)
+                            {
+                                WriteData(asciiBuffer, index, writer);
+                                index = 0;
+                            }
+
+                            // The end
+                            break;
+                        }
+
+                        // TODO: this shouldn't be possible?
+                    }
+
+                    if (value == EmptyBlock)
+                    {
+                        // A 'z' arriving part way through a group discards the characters read so far.
+                        // Write(byte) is required here: Write(int) would emit four bytes per call.
+                        for (int j = 0; j < 4; j++)
+                        {
+                            writer.Write((byte)0);
+                        }
+
+                        index = 0;
+
+                        // We've completed our block.
+                    }
+                    else
+                    {
+                        asciiBuffer[index] = (byte) (value - Offset);
+                        index++;
+                    }
+
+                    if (index == 5)
+                    {
+                        WriteData(asciiBuffer, index, writer);
+                        index = 0;
+                    }
+                }
+
+                // The input ended without an end-of-data marker: keep the final partial group instead of dropping it.
+                // A single leftover character cannot encode a byte, so it is ignored rather than treated as an error.
+                if (index > 1)
+                {
+                    WriteData(asciiBuffer, index, writer);
+                }
+
+                writer.Flush();
+
+                return stream.ToArray();
+            }
+        }
+
+        /// <summary>
+        /// Converts one group of base-85 digits into bytes and writes them to <paramref name="writer"/>.
+        /// </summary>
+        /// <param name="ascii">Five element buffer of digit values; elements from <paramref name="index"/> onwards are overwritten with padding.</param>
+        /// <param name="index">The number of digits read into the group (2 to 5); one fewer bytes are written.</param>
+        /// <param name="writer">The destination for the decoded bytes.</param>
+        private static void WriteData(byte[] ascii, int index, BinaryWriter writer)
+        {
+            if (index < 2)
+            {
+                throw new ArgumentOutOfRangeException(nameof(index), "Cannot convert a block padded by 4 'u' characters.");
+            }
+
+            // Write any empty padding if the block ended early.
+            for (var i = index; i < 5; i++)
+            {
+                ascii[i] = EmptyCharacterPadding - Offset;
+            }
+
+            // Accumulate in a long: the largest group value (84 * 85^4 and up) does not fit in a signed 32-bit integer.
+            long value = 0;
+            for (var i = 0; i < 5; i++)
+            {
+                value += (long)ascii[i] * PowerByIndex[4 - i];
+            }
+
+            writer.Write((byte)((value >> 24) & 0xFF));
+
+            if (index > 2)
+            {
+                writer.Write((byte)((value >> 16) & 0xFF));
+            }
+
+            if (index > 3)
+            {
+                writer.Write((byte)((value >> 8) & 0xFF));
+            }
+
+            if (index > 4)
+            {
+                writer.Write((byte)(value & 0xFF));
+            }
+        }
+
+        /// <summary>
+        /// Whether the byte is a white space character (NUL, tab, line feed, form feed, carriage return or space) to skip.
+        /// </summary>
+        private static bool IsWhiteSpace(byte b)
+        {
+            return b == '\0' || b == '\t' || b == '\n' || b == '\f' || b == '\r' || b == ' ';
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/Filters/IFilterProvider.cs b/src/UglyToad.Pdf/Filters/IFilterProvider.cs index aafc9a15..228cb656 100644 --- a/src/UglyToad.Pdf/Filters/IFilterProvider.cs +++ b/src/UglyToad.Pdf/Filters/IFilterProvider.cs @@ -21,11 +21,14 @@ public MemoryFilterProvider(IDecodeParameterResolver decodeParameterResolver, IPngPredictor pngPredictor, ILog log) { IFilter FlateFunc() => new FlateFilter(decodeParameterResolver, pngPredictor, log); + IFilter Ascii85Func() => new Ascii85Filter(); filterFactories = new Dictionary> { {CosName.FLATE_DECODE, FlateFunc}, {CosName.FLATE_DECODE_ABBREVIATION, FlateFunc}, + {CosName.ASCII85_DECODE, Ascii85Func}, + {CosName.ASCII85_DECODE_ABBREVIATION, Ascii85Func} }; }