Add a test for non-Latin characters and use normal ints rather than octal in the encoding classes.

This commit is contained in:
Eliot Jones
2018-01-01 13:49:24 +00:00
parent 874f713566
commit c34bdac92a
15 changed files with 302 additions and 109 deletions

View File

@@ -0,0 +1,55 @@
namespace UglyToad.Pdf.Tests.Integration
{
    using System;
    using System.IO;
    using System.Linq;
    using Content;
    using Xunit;

    /// <summary>
    /// Integration tests which open the sample document
    /// "Single Page Non Latin - from acrobat distiller.pdf" and check its page count,
    /// page size and extracted text.
    /// </summary>
    public class SinglePageNonLatinAcrobatDistillerTests
    {
        /// <summary>
        /// Builds the absolute path of the sample PDF. The documents folder is located
        /// three levels above the application base directory, under Integration/Documents.
        /// </summary>
        private static string GetFilename()
        {
            var baseDirectory = AppDomain.CurrentDomain.BaseDirectory;
            var relativeFolder = Path.Combine(baseDirectory, "..", "..", "..", "Integration", "Documents");
            var absoluteFolder = Path.GetFullPath(relativeFolder);

            return Path.Combine(absoluteFolder, "Single Page Non Latin - from acrobat distiller.pdf");
        }

        [Fact]
        public void HasCorrectNumberOfPages()
        {
            // This test opens the document from its raw bytes rather than from the path.
            var bytes = File.ReadAllBytes(GetFilename());

            using (var document = PdfDocument.Open(bytes))
            {
                Assert.Equal(1, document.NumberOfPages);
            }
        }

        [Fact]
        public void HasCorrectPageSize()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                var firstPage = document.GetPage(1);

                Assert.Equal(PageSize.Letter, firstPage.Size);
            }
        }

        [Fact]
        public void GetsCorrectPageTextIgnoringHiddenCharacters()
        {
            using (var document = PdfDocument.Open(GetFilename()))
            {
                var letterValues = document.GetPage(1).Letters.Select(letter => letter.Value);
                var text = string.Join(string.Empty, letterValues);

                // The non-Latin characters in the literal below can appear in the opposite order to the
                // one expected when viewed in an editor (presumably right-to-left display - unconfirmed),
                // but they match what is extracted.
                // TODO: Need someone who can read these characters to check them.
                Assert.Equal("Hello ﺪﻤﺤﻣ World. ", text);
            }
        }
    }
}

View File

@@ -2,6 +2,7 @@
{
using System;
using System.IO;
using System.Linq;
using Content;
using Xunit;
@@ -35,5 +36,18 @@
Assert.Equal(PageSize.Letter, page.Size);
}
}
/// <summary>
/// Concatenates the value of every letter on page 1 and compares the result with the expected sentence.
/// </summary>
[Fact]
public void GetsCorrectPageTextIgnoringHiddenCharacters()
{
    using (var document = PdfDocument.Open(GetFilename()))
    {
        var letterValues = document.GetPage(1).Letters.Select(letter => letter.Value);
        var actual = string.Join(string.Empty, letterValues);

        Assert.Equal("I am a simple pdf.", actual);
    }
}
}
}

View File

@@ -12,6 +12,7 @@
<None Remove="Fonts\TrueType\Roboto-Regular.ttf" />
<None Remove="Integration\Documents\Font Size Test - from google chrome print pdf.pdf" />
<None Remove="Integration\Documents\Font Size Test - from libre office.pdf" />
<None Remove="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from google drive.pdf" />
<None Remove="Integration\Documents\Single Page Simple - from open office.pdf" />
</ItemGroup>
@@ -29,6 +30,9 @@
<Content Include="Integration\Documents\Font Size Test - from google chrome print pdf.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Single Page Non Latin - from acrobat distiller.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<Content Include="Integration\Documents\Single Page Simple - from google drive.pdf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>

View File

@@ -37,63 +37,63 @@ namespace UglyToad.Pdf.Cos
}
set(i, (char)i, temporaryMap);
Set(i, (char)i, temporaryMap);
}
// then do all deviations (based on the table in ISO 32000-1:2008)
// block 1
set(0x18, '\u02D8', temporaryMap); // BREVE
set(0x19, '\u02C7', temporaryMap); // CARON
set(0x1A, '\u02C6', temporaryMap); // MODIFIER LETTER CIRCUMFLEX ACCENT
set(0x1B, '\u02D9', temporaryMap); // DOT ABOVE
set(0x1C, '\u02DD', temporaryMap); // DOUBLE ACUTE ACCENT
set(0x1D, '\u02DB', temporaryMap); // OGONEK
set(0x1E, '\u02DA', temporaryMap); // RING ABOVE
set(0x1F, '\u02DC', temporaryMap); // SMALL TILDE
Set(0x18, '\u02D8', temporaryMap); // BREVE
Set(0x19, '\u02C7', temporaryMap); // CARON
Set(0x1A, '\u02C6', temporaryMap); // MODIFIER LETTER CIRCUMFLEX ACCENT
Set(0x1B, '\u02D9', temporaryMap); // DOT ABOVE
Set(0x1C, '\u02DD', temporaryMap); // DOUBLE ACUTE ACCENT
Set(0x1D, '\u02DB', temporaryMap); // OGONEK
Set(0x1E, '\u02DA', temporaryMap); // RING ABOVE
Set(0x1F, '\u02DC', temporaryMap); // SMALL TILDE
// block 2
set(0x7F, ReplacementCharacter, temporaryMap); // undefined
set(0x80, '\u2022', temporaryMap); // BULLET
set(0x81, '\u2020', temporaryMap); // DAGGER
set(0x82, '\u2021', temporaryMap); // DOUBLE DAGGER
set(0x83, '\u2026', temporaryMap); // HORIZONTAL ELLIPSIS
set(0x84, '\u2014', temporaryMap); // EM DASH
set(0x85, '\u2013', temporaryMap); // EN DASH
set(0x86, '\u0192', temporaryMap); // LATIN SMALL LETTER SCRIPT F
set(0x87, '\u2044', temporaryMap); // FRACTION SLASH (solidus)
set(0x88, '\u2039', temporaryMap); // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
set(0x89, '\u203A', temporaryMap); // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
set(0x8A, '\u2212', temporaryMap); // MINUS SIGN
set(0x8B, '\u2030', temporaryMap); // PER MILLE SIGN
set(0x8C, '\u201E', temporaryMap); // DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
set(0x8D, '\u201C', temporaryMap); // LEFT DOUBLE QUOTATION MARK (quotedblleft)
set(0x8E, '\u201D', temporaryMap); // RIGHT DOUBLE QUOTATION MARK (quotedblright)
set(0x8F, '\u2018', temporaryMap); // LEFT SINGLE QUOTATION MARK (quoteleft)
set(0x90, '\u2019', temporaryMap); // RIGHT SINGLE QUOTATION MARK (quoteright)
set(0x91, '\u201A', temporaryMap); // SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
set(0x92, '\u2122', temporaryMap); // TRADE MARK SIGN
set(0x93, '\uFB01', temporaryMap); // LATIN SMALL LIGATURE FI
set(0x94, '\uFB02', temporaryMap); // LATIN SMALL LIGATURE FL
set(0x95, '\u0141', temporaryMap); // LATIN CAPITAL LETTER L WITH STROKE
set(0x96, '\u0152', temporaryMap); // LATIN CAPITAL LIGATURE OE
set(0x97, '\u0160', temporaryMap); // LATIN CAPITAL LETTER S WITH CARON
set(0x98, '\u0178', temporaryMap); // LATIN CAPITAL LETTER Y WITH DIAERESIS
set(0x99, '\u017D', temporaryMap); // LATIN CAPITAL LETTER Z WITH CARON
set(0x9A, '\u0131', temporaryMap); // LATIN SMALL LETTER DOTLESS I
set(0x9B, '\u0142', temporaryMap); // LATIN SMALL LETTER L WITH STROKE
set(0x9C, '\u0153', temporaryMap); // LATIN SMALL LIGATURE OE
set(0x9D, '\u0161', temporaryMap); // LATIN SMALL LETTER S WITH CARON
set(0x9E, '\u017E', temporaryMap); // LATIN SMALL LETTER Z WITH CARON
set(0x9F, ReplacementCharacter, temporaryMap); // undefined
set(0xA0, '\u20AC', temporaryMap); // EURO SIGN
Set(0x7F, ReplacementCharacter, temporaryMap); // undefined
Set(0x80, '\u2022', temporaryMap); // BULLET
Set(0x81, '\u2020', temporaryMap); // DAGGER
Set(0x82, '\u2021', temporaryMap); // DOUBLE DAGGER
Set(0x83, '\u2026', temporaryMap); // HORIZONTAL ELLIPSIS
Set(0x84, '\u2014', temporaryMap); // EM DASH
Set(0x85, '\u2013', temporaryMap); // EN DASH
Set(0x86, '\u0192', temporaryMap); // LATIN SMALL LETTER SCRIPT F
Set(0x87, '\u2044', temporaryMap); // FRACTION SLASH (solidus)
Set(0x88, '\u2039', temporaryMap); // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
Set(0x89, '\u203A', temporaryMap); // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
Set(0x8A, '\u2212', temporaryMap); // MINUS SIGN
Set(0x8B, '\u2030', temporaryMap); // PER MILLE SIGN
Set(0x8C, '\u201E', temporaryMap); // DOUBLE LOW-9 QUOTATION MARK (quotedblbase)
Set(0x8D, '\u201C', temporaryMap); // LEFT DOUBLE QUOTATION MARK (quotedblleft)
Set(0x8E, '\u201D', temporaryMap); // RIGHT DOUBLE QUOTATION MARK (quotedblright)
Set(0x8F, '\u2018', temporaryMap); // LEFT SINGLE QUOTATION MARK (quoteleft)
Set(0x90, '\u2019', temporaryMap); // RIGHT SINGLE QUOTATION MARK (quoteright)
Set(0x91, '\u201A', temporaryMap); // SINGLE LOW-9 QUOTATION MARK (quotesinglbase)
Set(0x92, '\u2122', temporaryMap); // TRADE MARK SIGN
Set(0x93, '\uFB01', temporaryMap); // LATIN SMALL LIGATURE FI
Set(0x94, '\uFB02', temporaryMap); // LATIN SMALL LIGATURE FL
Set(0x95, '\u0141', temporaryMap); // LATIN CAPITAL LETTER L WITH STROKE
Set(0x96, '\u0152', temporaryMap); // LATIN CAPITAL LIGATURE OE
Set(0x97, '\u0160', temporaryMap); // LATIN CAPITAL LETTER S WITH CARON
Set(0x98, '\u0178', temporaryMap); // LATIN CAPITAL LETTER Y WITH DIAERESIS
Set(0x99, '\u017D', temporaryMap); // LATIN CAPITAL LETTER Z WITH CARON
Set(0x9A, '\u0131', temporaryMap); // LATIN SMALL LETTER DOTLESS I
Set(0x9B, '\u0142', temporaryMap); // LATIN SMALL LETTER L WITH STROKE
Set(0x9C, '\u0153', temporaryMap); // LATIN SMALL LIGATURE OE
Set(0x9D, '\u0161', temporaryMap); // LATIN SMALL LETTER S WITH CARON
Set(0x9E, '\u017E', temporaryMap); // LATIN SMALL LETTER Z WITH CARON
Set(0x9F, ReplacementCharacter, temporaryMap); // undefined
Set(0xA0, '\u20AC', temporaryMap); // EURO SIGN
// end of deviations
UnicodeToCode = temporaryMap;
}
private static void set(int code, char unicode, Dictionary<char, int> unicodeToCode)
private static void Set(int code, char unicode, Dictionary<char, int> unicodeToCode)
{
CodeToUni[code] = unicode;
unicodeToCode.Add(unicode, code);
unicodeToCode[unicode] = code;
}
/**

View File

@@ -2,6 +2,7 @@
{
using System;
using System.Collections.Generic;
using Cos;
/// <summary>
/// Maps character codes to glyph names from a PostScript encoding.
@@ -55,5 +56,41 @@
NameToCode[name] = code;
}
}
/// <summary>
/// Looks up one of the predefined encodings by its name.
/// </summary>
/// <param name="name">The encoding name to resolve; may be <c>null</c>.</param>
/// <param name="encoding">
/// Set to the matching encoding instance when the name is recognised, otherwise <c>null</c>.
/// </param>
/// <returns>
/// <c>true</c> if <paramref name="name"/> equals the standard, WinAnsi, MacExpert or MacRoman
/// encoding name; <c>false</c> for <c>null</c> or any other name.
/// </returns>
public static bool TryGetNamedEncoding(CosName name, out Encoding encoding)
{
    if (name == null)
    {
        encoding = null;
        return false;
    }

    // The names are tested in a fixed order and the first match wins.
    if (name.Equals(CosName.STANDARD_ENCODING))
    {
        encoding = StandardEncoding.Instance;
    }
    else if (name.Equals(CosName.WIN_ANSI_ENCODING))
    {
        encoding = WinAnsiEncoding.Instance;
    }
    else if (name.Equals(CosName.MAC_EXPERT_ENCODING))
    {
        encoding = MacExpertEncoding.Instance;
    }
    else if (name.Equals(CosName.MAC_ROMAN_ENCODING))
    {
        encoding = MacRomanEncoding.Instance;
    }
    else
    {
        encoding = null;
        return false;
    }

    return true;
}
}
}

View File

@@ -1,5 +1,7 @@
namespace UglyToad.Pdf.Fonts.Encodings
{
using Util;
internal class MacExpertEncoding : Encoding
{
/// <summary>
@@ -182,7 +184,7 @@
{
foreach (var valueTuple in EncodingTable)
{
Add(valueTuple.Item1, valueTuple.Item2);
Add(OctalHelpers.FromOctalInt(valueTuple.Item1), valueTuple.Item2);
}
}
}

View File

@@ -1,5 +1,7 @@
namespace UglyToad.Pdf.Fonts.Encodings
{
using Util;
/// <summary>
/// Similar to the <see cref="MacRomanEncoding"/> with 15 additional entries.
/// </summary>
@@ -31,7 +33,7 @@
{
foreach (var valueTuple in EncodingTable)
{
Add(valueTuple.Item1, valueTuple.Item2);
Add(OctalHelpers.FromOctalInt(valueTuple.Item1), valueTuple.Item2);
}
}
}

View File

@@ -1,5 +1,7 @@
namespace UglyToad.Pdf.Fonts.Encodings
{
using Util;
internal class MacRomanEncoding : Encoding
{
/// <summary>
@@ -226,7 +228,7 @@
{
foreach (var valueTuple in EncodingTable)
{
Add(valueTuple.Item1, valueTuple.Item2);
Add(OctalHelpers.FromOctalInt(valueTuple.Item1), valueTuple.Item2);
}
}
}

View File

@@ -1,5 +1,7 @@
namespace UglyToad.Pdf.Fonts.Encodings
{
using Util;
internal class StandardEncoding : Encoding
{
private static readonly (int, string)[] EncodingTable =
@@ -163,7 +165,7 @@
{
foreach (var valueTuple in EncodingTable)
{
Add(valueTuple.Item1, valueTuple.Item2);
Add(OctalHelpers.FromOctalInt(valueTuple.Item1), valueTuple.Item2);
}
}
}

View File

@@ -1,7 +1,13 @@
namespace UglyToad.Pdf.Fonts.Encodings
{
using Util;
internal class WinAnsiEncoding : Encoding
{
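/// <summary>
/// The encoding table is taken from the Appendix of the specification.
/// Each code is written with the digits of its octal value. C# has no octal literals, so the
/// compiler reads these as decimal numbers (for example <c>0101</c> is decimal 101) and they
/// must be converted with <c>OctalHelpers.FromOctalInt</c> before being used as character codes.
/// </summary>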
private static readonly (int, string)[] EncodingTable =
{
(0101, "A"),
@@ -233,7 +239,10 @@
{
foreach (var valueTuple in EncodingTable)
{
Add(valueTuple.Item1, valueTuple.Item2);
// Convert out of octal before creating
var code = OctalHelpers.FromOctalInt(valueTuple.Item1);
Add(code, valueTuple.Item2);
}
// In WinAnsiEncoding, all unused codes greater than 40 map to the bullet character.

View File

@@ -1,9 +1,11 @@
namespace UglyToad.Pdf.Fonts.Parser.Handlers
{
using System;
using System.Linq;
using Cmap;
using ContentStream;
using Cos;
using Encodings;
using Exceptions;
using Filters;
using IO;
@@ -60,6 +62,33 @@
}
}
Encoding encoding = null;
if (dictionary.TryGetValue(CosName.ENCODING, out var encodingBase))
{
// Symbolic fonts default to standard encoding.
if (descriptor.Flags.HasFlag(FontFlags.Symbolic))
{
encoding = StandardEncoding.Instance;
}
if (encodingBase is CosName encodingName)
{
if (!Encoding.TryGetNamedEncoding(encodingName, out encoding))
{
// TODO: PDFBox would not throw here.
throw new InvalidFontFormatException($"Unrecognised encoding name: {encodingName}");
}
}
else if (encodingBase is CosDictionary encodingDictionary)
{
throw new NotImplementedException("No support for reading encoding from dictionary yet.");
}
else
{
throw new NotImplementedException("No support for reading encoding from font yet.");
}
}
return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap);
}

View File

@@ -1,5 +1,6 @@
namespace UglyToad.Pdf.Fonts.Simple
{
using System;
using Cmap;
using Composite;
using Cos;
@@ -47,7 +48,18 @@
if (!ToUnicode.CanMapToUnicode)
{
return false;
// For now just cast to character
try
{
value = ((char) characterCode).ToString();
return true;
}
catch (Exception)
{
return false;
}
}
return ToUnicode.TryGet(characterCode, out value);

View File

@@ -4,6 +4,7 @@
using IO;
using Parser.Parts;
using Tokens;
using Util;
public class StringTokenizer : ITokenizer
{
@@ -51,7 +52,7 @@
if (octalsRead == 3 || !nextCharacterOctal)
{
var characterCode = FromOctal(octal);
var characterCode = OctalHelpers.FromOctalDigits(octal);
// For now :(
// TODO: I have a sneaking suspicion this is wrong, not sure what behaviour is for large octal numbers
@@ -159,7 +160,7 @@
octals[i] = octals[i - 1];
}
var value = OctalCharacterToShort(nextOctalChar);
var value = nextOctalChar.CharacterToShort();
octals[0] = value;
}
@@ -220,7 +221,7 @@
case '5':
case '6':
case '7':
octal[0] = OctalCharacterToShort(c);
octal[0] = c.CharacterToShort();
isOctalActive = true;
octalsRead = 1;
break;
@@ -242,60 +243,5 @@
break;
}
}
/// <summary>
/// Converts a digit character to its numeric value.
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>
/// 0 to 9 for the characters '0' to '9' (note that '8' and '9' are accepted as well),
/// and 0 for every other character.
/// </returns>
private static short OctalCharacterToShort(char c)
{
    var isDigit = c >= '0' && c <= '9';

    if (!isDigit)
    {
        return 0;
    }

    // The characters '0'..'9' are contiguous, so the offset from '0' is the digit value.
    return (short)(c - '0');
}
/// <summary>
/// Combines octal digits into a single integer.
/// </summary>
/// <param name="octal">
/// The digits, least significant first: the digit at index <c>i</c> is weighted by 8 to the power <c>i</c>.
/// </param>
/// <returns>The sum of every digit multiplied by its weight; 0 for an empty array.</returns>
private static int FromOctal(short[] octal)
{
    var total = 0;

    // Weight of the current position: 1, 8, 64, ...
    var placeValue = 1;

    foreach (var digit in octal)
    {
        total += digit * placeValue;
        placeValue *= 8;
    }

    return total;
}
}
}

View File

@@ -0,0 +1,79 @@
namespace UglyToad.Pdf.Util
{
    using System;

    /// <summary>
    /// Helpers for converting octal representations (digit characters, digit arrays and
    /// decimal-looking integers) into their numeric values.
    /// </summary>
    internal static class OctalHelpers
    {
        /// <summary>
        /// Converts a digit character to its numeric value.
        /// </summary>
        /// <param name="c">The character to convert; one of '0' to '9'.</param>
        /// <returns>The value 0 to 9. Note that '8' and '9' are accepted even though they are not octal digits.</returns>
        /// <exception cref="InvalidOperationException">The character is not one of '0' to '9'.</exception>
        public static short CharacterToShort(this char c)
        {
            switch (c)
            {
                case '0':
                    return 0;
                case '1':
                    return 1;
                case '2':
                    return 2;
                case '3':
                    return 3;
                case '4':
                    return 4;
                case '5':
                    return 5;
                case '6':
                    return 6;
                case '7':
                    return 7;
                case '8':
                    return 8;
                case '9':
                    return 9;
                default:
                    throw new InvalidOperationException($"Could not convert the character {c} to a short.");
            }
        }

        /// <summary>
        /// Combines octal digits into a single integer.
        /// </summary>
        /// <param name="octal">
        /// The digits, least significant first: the digit at index <c>i</c> is weighted by 8 to the power <c>i</c>.
        /// </param>
        /// <returns>The combined value; 0 for an empty array.</returns>
        public static int FromOctalDigits(short[] octal)
        {
            int sum = 0;

            for (int i = octal.Length - 1; i >= 0; i--)
            {
                var power = i;
                sum += octal[i] * QuickPower(8, power);
            }

            return sum;
        }

        /// <summary>
        /// Interprets the decimal digits of <paramref name="input"/> as an octal number,
        /// for example 102 (read as octal 102) becomes 66.
        /// </summary>
        /// <param name="input">An integer whose decimal digits spell the octal value.</param>
        /// <returns>The value of the digits read in base 8.</returns>
        /// <exception cref="InvalidOperationException">
        /// The textual form of <paramref name="input"/> contains a non-digit character, as thrown by
        /// <see cref="CharacterToShort"/> (e.g. the sign of a negative number).
        /// </exception>
        /// <remarks>The digits 8 and 9 are not rejected; they are simply used as the digit value.</remarks>
        public static int FromOctalInt(int input)
        {
            var str = input.ToString();

            int sum = 0;

            // The first character of the string is the MOST significant digit, so accumulate
            // left-to-right: shift what has been read so far by one octal place, then add the next digit.
            // (Weighting str[i] by 8^i would read the number backwards and only work for palindromes.)
            foreach (var digit in str)
            {
                sum = (sum * 8) + digit.CharacterToShort();
            }

            return sum;
        }

        /// <summary>
        /// Integer exponentiation by squaring.
        /// </summary>
        /// <param name="x">The base.</param>
        /// <param name="pow">The exponent; the loop inspects its bits from least significant upwards.</param>
        /// <returns><paramref name="x"/> raised to <paramref name="pow"/>; 1 when the exponent is 0.</returns>
        private static int QuickPower(int x, int pow)
        {
            int ret = 1;

            while (pow != 0)
            {
                // Multiply in the current square whenever the matching exponent bit is set.
                if ((pow & 1) == 1)
                {
                    ret *= x;
                }

                x *= x;
                pow >>= 1;
            }

            return ret;
        }
    }
}