add support for ascii 85 decoding

This commit is contained in:
Eliot Jones
2017-12-24 22:34:08 +00:00
parent b0960872a5
commit a3010e20ef
3 changed files with 227 additions and 0 deletions

View File

@@ -0,0 +1,79 @@
namespace UglyToad.Pdf.Tests.Filters
{
using System.Text;
using ContentStream;
using Pdf.Filters;
using Xunit;
/// <summary>
/// Unit tests for <see cref="Ascii85Filter"/> which check that ASCII 85 encoded input is decoded to the expected ASCII text.
/// </summary>
public class Ascii85FilterTests
{
// The filter instance exercised by every test in this class.
private readonly Ascii85Filter filter = new Ascii85Filter();
/// <summary>
/// Decodes an ASCII 85 encoded quotation and compares the result with the expected plain text.
/// </summary>
[Fact]
public void DecodesWikipediaExample()
{
// The encoded literal spans several lines and contains spaces; the input ends with the "~>" marker.
var bytes = Encoding.ASCII.GetBytes(
@"9jqo^BlbD-BleB1DJ+*+F(f,q/0JhKF<GL>Cj@.4Gp$d7F!,L7@<6@)/0JDEF<G%<+EV:2F!,
O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF""AGXBPCsi + DGm >@3BB / F * &OCAfu2 / AKY
i(DIb: @FD, *) + C]U =@3BN#EcYf8ATD3s@q?d$AftVqCh[NqF<G:8+EV:.+Cf>-FD5W8ARlolDIa
l(DId<j@<? 3r@:F % a + D58'ATD4$Bl@l3De:,-DJs`8ARoFb/0JMK@qB4^F!,R<AKZ&-DfTqBG%G
> uD.RTpAKYo'+CT/5+Cei#DII?(E,9)oF*2M7/c~>");
// An empty dictionary and filter index 0 are passed alongside the encoded bytes.
var result = filter.Decode(bytes, new PdfDictionary(), 0);
var text = Encoding.ASCII.GetString(result);
Assert.Equal("Man is distinguished, not only by his reason, but by this singular passion from other animals, which is a lust of the mind, " +
"that by a perseverance of delight in the continued and indefatigable generation of knowledge, " +
"exceeds the short vehemence of any carnal pleasure.",
text);
}
/// <summary>
/// Decodes an ASCII 85 encoded PDF object (a content stream wrapped in "1 0 obj" ... "endobj")
/// and compares the result with the expected plain text.
/// </summary>
[Fact]
public void DecodesEncodedPdfContent()
{
// Encoded input; the literal is split over several lines and ends with the "~>" marker.
const string input =
@"0d&.mDdmGg4?O`>9P&*SFD)dS2E2gC4pl@QEb/Zr$8N_r$:7]!01IZ=0eskNAdU47<+?7h+B3Ol2_m!C+?)#1+B1
`9>:<KhASu!rA7]9oF*)G6@;U'.@ps6t@V$[&ART*lARTXoCj@HP2DlU*/0HBI+B1r?0H_r%1a#ac$<nof.3LB""+=MAS+D58'ATD3qCj@.
F@;@;70ea^uAKYi.Eb-A7E+*6f+EV:*DBN1?0ek+_+B1r?<%9""=ASu!rA7]9oF*)G6@;U'<.3MT)$8<SS1,pCU6jd-H;e7
C#1,U1&Ft""Og2'=;YEa`c,ASu!rA8,po+Dk\3BQ%F&+CT;%+CQ]A1,'h!Ft""Oh2'=;UBl%3eCh4`'DBMbD7O]H>0H_br.:""&q8d[6p/M
T()<(%'A;f?Ma+CT;%+E_a:A0>K&EZek1D/aN,F)u&6DBNA*A0>f4BOu4*+EM76E,9eK+B3(_<%9""p.!0AMEb031ATMF#F<G%,DIIR2+Cno
&@3B9%+CT.1.3LK*+=KNS6V0ilAoD^,@<=+N>p**=$</Jt-rY&$AKYo'+EV:.+Cf>,E,oN2F(oQ1+D#G#De*R""B-;&&FD,T'F!+n3AKY4b
F*22=@:F%a+=SF4C'moi+=Li?EZeh0FD)e-@<>p#@;]TuBl.9kATKCFGA(],AKYo5BOu4*+CT;%+C#7pF_Pr+@VfTuDf0B:+=SF4C'moi+=
Li?EZek1DKKT1F`2DD/TboKAKY](@:s.m/h%oBC'mC/$>""*cF*)G6@;Q?_DIdZpC&~>";
var result = filter.Decode(Encoding.ASCII.GetBytes(input), new PdfDictionary(), 0);
var text = Encoding.ASCII.GetString(result);
// The expected decoded text, written as a verbatim literal so its line breaks are part of the value.
const string expected = @"1 0 obj
<< /Length 568 >>
stream
2 J
BT
/F1 12 Tf
0 Tc
0 Tw
72.5 712 TD
[(Unencoded streams can be read easily) 65 (, )] TJ
0 -14 TD
[(b) 20 (ut generally tak) 10 (e more space than \311)] TJ
T* (encoded streams.) Tj
0 -28 TD
[(Se) 25 (v) 15 (eral encoding methods are a) 20 (v) 25 (ailable in PDF) 80 (.)] TJ
0 -14 TD
(Some are used for compression and others simply) Tj
T* [(to represent binary data in an ) 55 (ASCII format.)] TJ
T* (Some of the compression encoding methods are \
suitable ) Tj
T* (for both data and images, while others are \
suitable only ) Tj
T* (for continuous-tone images.) Tj
ET
endstream
endobj";
// Any "\r\n" pairs in the expected literal are converted to "\n" before comparing with the decoded text.
Assert.Equal(expected.Replace("\r\n", "\n"), text);
}
}
}

View File

@@ -0,0 +1,145 @@
namespace UglyToad.Pdf.Filters
{
using System;
using System.IO;
using ContentStream;
/// <summary>
/// ASCII 85 (Base85) is a binary to text encoding using 5 ASCII characters per 4 bytes of data.
/// </summary>
/// <remarks>
/// Each group of 5 characters in the range '!' to 'u' is a base-85 number, most significant digit first,
/// which decodes to 4 bytes. The single character 'z' is shorthand for 4 zero bytes and the sequence "~&gt;"
/// marks the end of the data. White-space between characters is ignored.
/// </remarks>
public class Ascii85Filter : IFilter
{
    private const byte EmptyBlock = (byte)'z';
    private const byte Offset = (byte)'!';
    private const byte EmptyCharacterPadding = (byte)'u';

    private const int Radix = 85;
    private const int EncodedGroupLength = 5;
    private const int DecodedGroupLength = 4;

    private static readonly byte[] EndOfDataBytes = { (byte)'~', (byte)'>' };

    /// <summary>
    /// Decodes ASCII 85 encoded data to the bytes it represents.
    /// </summary>
    /// <param name="input">The ASCII 85 encoded bytes, normally terminated by "~&gt;".</param>
    /// <param name="streamDictionary">The dictionary of the stream being decoded. Not used by this filter.</param>
    /// <param name="filterIndex">The position of this filter in the stream's filter list. Not used by this filter.</param>
    /// <returns>The decoded bytes.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The final group contains a single character so cannot be decoded.</exception>
    public byte[] Decode(byte[] input, PdfDictionary streamDictionary, int filterIndex)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        var asciiBuffer = new byte[EncodedGroupLength];
        var index = 0;

        using (var stream = new MemoryStream())
        {
            for (var i = 0; i < input.Length; i++)
            {
                var value = input[i];

                if (IsWhiteSpace(value))
                {
                    continue;
                }

                // '~' as the last byte, or followed by '>', terminates the data.
                // A '~' followed by anything else is not valid ASCII 85; it is still treated as a data
                // character below so that malformed input is handled as it always has been.
                var isEndOfData = value == EndOfDataBytes[0]
                    && (i == input.Length - 1 || input[i + 1] == EndOfDataBytes[1]);

                if (isEndOfData)
                {
                    if (index > 0)
                    {
                        // Flush the final, partial group.
                        WriteData(asciiBuffer, index, stream);
                    }

                    break;
                }

                if (value == EmptyBlock)
                {
                    // 'z' stands for exactly four zero bytes. Write them as single bytes: writing the
                    // int literal 0 would emit four bytes per call (sixteen in total).
                    for (var j = 0; j < DecodedGroupLength; j++)
                    {
                        stream.WriteByte(0);
                    }

                    // A 'z' inside a partially read group is invalid; any characters read so far for
                    // the group are discarded.
                    index = 0;
                    continue;
                }

                asciiBuffer[index] = (byte)(value - Offset);
                index++;

                if (index == EncodedGroupLength)
                {
                    WriteData(asciiBuffer, index, stream);
                    index = 0;
                }
            }

            return stream.ToArray();
        }
    }

    /// <summary>
    /// Converts one group of base-85 digits to bytes and appends them to the output.
    /// </summary>
    /// <param name="ascii">Buffer of 5 digits (character value minus '!'); unused trailing slots are overwritten with padding.</param>
    /// <param name="index">The number of digits actually read into <paramref name="ascii"/> (2 to 5).</param>
    /// <param name="output">The stream receiving the decoded bytes.</param>
    private static void WriteData(byte[] ascii, int index, Stream output)
    {
        if (index < 2)
        {
            throw new ArgumentOutOfRangeException(nameof(index), "Cannot convert a block padded by 4 'u' characters.");
        }

        // Write any empty padding if the block ended early.
        for (var i = index; i < EncodedGroupLength; i++)
        {
            ascii[i] = (byte)(EmptyCharacterPadding - Offset);
        }

        // Accumulate in a long: a full group can reach 2^32 - 1 which does not fit in a signed int.
        long value = 0;
        for (var i = 0; i < EncodedGroupLength; i++)
        {
            value = (value * Radix) + ascii[i];
        }

        // A group of n characters carries n - 1 bytes of data, most significant byte first.
        // Masking keeps the casts valid in both checked and unchecked compilation.
        var byteCount = index - 1;
        for (var i = 0; i < byteCount; i++)
        {
            var shift = 8 * (DecodedGroupLength - 1 - i);
            output.WriteByte((byte)((value >> shift) & 0xFF));
        }
    }

    /// <summary>
    /// Whether the byte is one of the PDF white-space characters (NUL, tab, line feed, form feed,
    /// carriage return or space) which are skipped while decoding.
    /// </summary>
    private static bool IsWhiteSpace(byte b)
    {
        return b == '\r' || b == '\n' || b == ' ' || b == '\t' || b == '\f' || b == '\0';
    }
}
}

View File

@@ -21,11 +21,14 @@
/// <summary>
/// Creates the provider and registers a factory for each supported filter name,
/// covering both the full and abbreviated names of the Flate and ASCII 85 filters.
/// </summary>
/// <param name="decodeParameterResolver">Passed to each <see cref="FlateFilter"/> created.</param>
/// <param name="pngPredictor">Passed to each <see cref="FlateFilter"/> created.</param>
/// <param name="log">Passed to each <see cref="FlateFilter"/> created.</param>
public MemoryFilterProvider(IDecodeParameterResolver decodeParameterResolver, IPngPredictor pngPredictor, ILog log)
{
    // Factories are invoked on demand, so each call builds a new filter instance.
    Func<IFilter> createFlate = () => new FlateFilter(decodeParameterResolver, pngPredictor, log);
    Func<IFilter> createAscii85 = () => new Ascii85Filter();

    var factories = new Dictionary<CosName, Func<IFilter>>();

    factories.Add(CosName.FLATE_DECODE, createFlate);
    factories.Add(CosName.FLATE_DECODE_ABBREVIATION, createFlate);
    factories.Add(CosName.ASCII85_DECODE, createAscii85);
    factories.Add(CosName.ASCII85_DECODE_ABBREVIATION, createAscii85);

    filterFactories = factories;
}