skip single letter final blocks

Align with the behavior of the PDFBox and C implementations, where
single-character final blocks are ignored rather than being written.
Also make the error more informative in case it is ever encountered
again.

add more test cases.

It is possible this is hiding the problem and will move the error elsewhere,
but it matches the behavior of the 2 reference
implementations. One other potential source of the error is if PDF supports
'<~' as a start-of-data marker, which I can't find in the spec but Wikipedia
says might be possible. Without documents to trigger the error, I think
this is the best fix for now.
This commit is contained in:
EliotJones 2025-07-08 18:14:29 -05:00 committed by BobLd
parent 781991b6bf
commit 7fe60ff8c3
2 changed files with 60 additions and 15 deletions

View File

@ -34,10 +34,39 @@ O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF""AGXBPCsi + DGm >@3BB / F * &OCAfu2
text);
}
[Fact]
public void ReplacesZWithEmptyBytes()
[Theory]
[InlineData("BE", "h")]
[InlineData("BOq", "he")]
[InlineData("BOtu", "hel")]
[InlineData("BOu!r", "hell")]
[InlineData("BOu!rDZ", "hello")]
[InlineData("BOu!rD]f", "hello ")]
[InlineData("BOu!rD]j6", "hello w")]
[InlineData("BOu!rD]j7B", "hello wo")]
[InlineData("BOu!rD]j7BEW", "hello wor")]
[InlineData("BOu!rD]j7BEbk", "hello worl")]
[InlineData("BOu!rD]j7BEbo7", "hello world")]
[InlineData("BOu!rD]j7BEbo80", "hello world!")]
public void DecodesHelloWorld(string encoded, string decoded)
{
var bytes = Encoding.ASCII.GetBytes("9jqo^zBlbD-");
var result = filter.Decode(
Encoding.ASCII.GetBytes(encoded),
dictionary,
TestFilterProvider.Instance,
0);
Assert.Equal(decoded, Encoding.ASCII.GetString(result.ToArray()));
}
[Theory]
[InlineData("9jqo^zBlbD-", "Man \0\0\0\0is d")]
[InlineData("", "")]
[InlineData("z", "\0\0\0\0")]
[InlineData("zz", "\0\0\0\0\0\0\0\0")]
[InlineData("zzz", "\0\0\0\0\0\0\0\0\0\0\0\0")]
public void ReplacesZWithEmptyBytes(string encoded, string decoded)
{
var bytes = Encoding.ASCII.GetBytes(encoded);
var result = filter.Decode(bytes, dictionary, TestFilterProvider.Instance, 1);
@ -47,7 +76,7 @@ O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF""AGXBPCsi + DGm >@3BB / F * &OCAfu2
string text = Encoding.ASCII.GetString(result.Span);
#endif
Assert.Equal("Man \0\0\0\0is d", text);
Assert.Equal(decoded, text);
}
[Fact]
@ -60,14 +89,17 @@ O<DJ+*.@<*K0@<6L(Df-\0Ec5e;DffZ(EZee.Bl.9pF""AGXBPCsi + DGm >@3BB / F * &OCAfu2
Assert.Throws<InvalidOperationException>(action);
}
[Fact]
public void SingleCharacterLastThrows()
[Theory]
[InlineData("@rH:%B", "cool")]
[InlineData("A~>", "")]
[InlineData("@rH:%A~>", "cool")]
public void SingleCharacterLastIgnores(string encoded, string decoded)
{
var bytes = Encoding.ASCII.GetBytes("9jqo^B");
var bytes = Encoding.ASCII.GetBytes(encoded);
Action action = () => filter.Decode(bytes, dictionary, TestFilterProvider.Instance, 1);
var result = filter.Decode(bytes, dictionary, TestFilterProvider.Instance, 1);
Assert.Throws<ArgumentOutOfRangeException>(action);
Assert.Equal(decoded, Encoding.ASCII.GetString(result.ToArray()));
}
private const string PdfContent = @"1 0 obj

View File

@ -2,6 +2,7 @@
{
using System;
using Core;
using System.Text;
using Tokens;
/// <summary>
@ -13,7 +14,7 @@
private const byte Offset = (byte)'!';
private const byte EmptyCharacterPadding = (byte)'u';
private static ReadOnlySpan<byte> EndOfDataBytes => [(byte)'~', (byte)'>'];
private static ReadOnlySpan<byte> EndOfDataBytes => "~>"u8;
private static readonly int[] PowerByIndex =
[
@ -52,7 +53,7 @@
{
if (index > 0)
{
WriteData(asciiBuffer, index, writer);
WriteData(asciiBuffer, index, writer, true);
}
index = 0;
@ -88,24 +89,36 @@
if (index == 5)
{
WriteData(asciiBuffer, index, writer);
WriteData(asciiBuffer, index, writer, false);
index = 0;
}
}
if (index > 0)
{
WriteData(asciiBuffer, index, writer);
WriteData(asciiBuffer, index, writer, true);
}
return writer.WrittenMemory.ToArray();
}
private static void WriteData(Span<byte> ascii, int index, ArrayPoolBufferWriter<byte> writer)
private static void WriteData(
Span<byte> ascii,
int index,
ArrayPoolBufferWriter<byte> writer,
bool isAtEnd)
{
if (index < 2)
{
throw new ArgumentOutOfRangeException(nameof(index), "Cannot convert a block padded by 4 'u' characters.");
if (isAtEnd)
{
return;
}
var bufferTxt = Encoding.ASCII.GetString(ascii);
var soFar = Encoding.ASCII.GetString(writer.GetSpan());
throw new ArgumentOutOfRangeException(nameof(index),
$"Cannot convert a this block because we're not at the end of the stream. Chunk: '{bufferTxt}'. Content: '{soFar}'");
}
// Write any empty padding if the block ended early.