diff --git a/src/UglyToad.Pdf/Content/Page.cs b/src/UglyToad.Pdf/Content/Page.cs index 8da72d2f..ae86baa6 100644 --- a/src/UglyToad.Pdf/Content/Page.cs +++ b/src/UglyToad.Pdf/Content/Page.cs @@ -87,16 +87,15 @@ } var contents = contentStream.Decode(parsingArguments.Container.Get()); - - if (Debugger.IsAttached) - { - var textContents = OtherEncodings.BytesAsLatin1String(contents); - } - + var operations = parsingArguments.Container.Get() .Parse(parsingArguments.Container.Get(), new ByteArrayInputBytes(contents)); + var context = new ContentStreamProcessor(MediaBox.Bounds, parsingArguments.CachingProviders.ResourceContainer); + var content = context.Process(operations); + + Content = content; } } } diff --git a/src/UglyToad.Pdf/Content/PageContent.cs b/src/UglyToad.Pdf/Content/PageContent.cs index ca199d68..77653def 100644 --- a/src/UglyToad.Pdf/Content/PageContent.cs +++ b/src/UglyToad.Pdf/Content/PageContent.cs @@ -13,5 +13,7 @@ public class PageContent { internal IReadOnlyList GraphicsStateOperations { get; set; } + + public IReadOnlyList Text { get; set; } } } diff --git a/src/UglyToad.Pdf/Content/ResourceContainer.cs b/src/UglyToad.Pdf/Content/ResourceContainer.cs index 88ecee28..9d059502 100644 --- a/src/UglyToad.Pdf/Content/ResourceContainer.cs +++ b/src/UglyToad.Pdf/Content/ResourceContainer.cs @@ -4,7 +4,11 @@ using System.Collections.Generic; using ContentStream; using Cos; + using Filters; using Fonts; + using Fonts.Cmap; + using Fonts.Parser; + using IO; using Parser; internal interface IResourceStore @@ -47,7 +51,35 @@ var fontObject = dynamicParser.Parse(arguments, objectKey, false) as ContentStreamDictionary; - var font = new CompositeFont(); + if (fontObject == null) + { + throw new InvalidOperationException($"Could not retrieve the font with name: {pair.Key} which should have been object {objectKey.GetObjectNumber()}"); + } + + CMap toUnicodeCMap = null; + if (fontObject.ContainsKey(CosName.TO_UNICODE)) + { + var toUnicodeValue = fontObject[CosName.TO_UNICODE]; + + var toUnicode = dynamicParser.Parse(arguments, toUnicodeValue as CosObject, false) as RawCosStream; + + var decodedUnicodeCMap = toUnicode?.Decode(arguments.Container.Get()); + + if (decodedUnicodeCMap != null) + { + toUnicodeCMap = arguments.Container.Get() + .Parse(new ByteArrayInputBytes(decodedUnicodeCMap), arguments.IsLenientParsing); + } + + + } + + var font = new CompositeFont + { + Name = pair.Key, + SubType = fontObject.GetName(CosName.SUBTYPE), + ToUnicode = toUnicodeCMap + }; loadedFonts[pair.Key] = font; } @@ -55,7 +87,9 @@ public IFont GetFont(CosName name) { - throw new NotImplementedException(); + loadedFonts.TryGetValue(name, out var font); + + return font; } } } diff --git a/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs b/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs index 184f948d..785c8faa 100644 --- a/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs +++ b/src/UglyToad.Pdf/Fonts/Cmap/Cmap.cs @@ -2,6 +2,8 @@ { using System; using System.Collections.Generic; + using System.Linq; + using IO; using Util.JetBrains.Annotations; public class CMap @@ -32,6 +34,9 @@ public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0; + private readonly int minCodeLength = 4; + private readonly int maxCodeLength; + public CMap(CharacterIdentifierSystemInfo info, int type, int wMode, string name, string version, IReadOnlyDictionary baseFontCharacterMap, IReadOnlyList codespaceRanges, IReadOnlyList cidRanges, IReadOnlyList cidCharacterMappings) { Info = info; @@ -43,6 +48,8 @@ CodespaceRanges = codespaceRanges ?? throw new ArgumentNullException(nameof(codespaceRanges)); CidRanges = cidRanges ?? throw new ArgumentNullException(nameof(cidRanges)); CidCharacterMappings = cidCharacterMappings ?? throw new ArgumentNullException(nameof(cidCharacterMappings)); + maxCodeLength = CodespaceRanges.Max(x => x.CodeLength); + minCodeLength = CodespaceRanges.Min(x => x.CodeLength); } private int wmode = 0; @@ -54,8 +61,6 @@ private string ordering = null; private int supplement = 0; - private int minCodeLength = 4; - private int maxCodeLength; // CID mappings private readonly Dictionary codeToCid = new Dictionary(); @@ -77,38 +82,6 @@ return found; } - /** - * Reads a character code from a string in the content stream. - *

See "CMap Mapping" and "Handling Undefined Characters" in PDF32000 for more details. - * - * @param in string stream - * @return character code - * @throws IOException if there was an error reading the stream or CMap - */ - //public int readCode(InputStream input) - //{ - // byte[] bytes = new byte[maxCodeLength]; - // input.read(bytes, 0, minCodeLength); - // for (int i = minCodeLength - 1; i < maxCodeLength; i++) - // { - // var byteCount = i + 1; - // foreach (var range in codespaceRanges) - // { - // if (range.isFullMatch(bytes, byteCount)) - // { - // return toInt(bytes, byteCount); - // } - // } - // if (byteCount < maxCodeLength) - // { - // bytes[byteCount] = (byte)input.read(); - // } - // } - - // throw new InvalidOperationException("CMap is invalid"); - //} - - /** * Returns the CID for the given character code. * @@ -139,6 +112,57 @@ { return cmapName; } - } + public int ReadCode(IInputBytes bytes) + { + byte[] result = new byte[maxCodeLength]; + + result[0] = bytes.CurrentByte; + + for (int i = 1; i < minCodeLength; i++) + { + result[i] = ReadByte(bytes); + } + + for (int i = minCodeLength - 1; i < maxCodeLength; i++) + { + int byteCount = i + 1; + foreach (CodespaceRange range in CodespaceRanges) + { + if (range.isFullMatch(result, byteCount)) + { + return ByteArrayToInt(result, byteCount); + } + } + if (byteCount < maxCodeLength) + { + result[byteCount] = ReadByte(bytes); + } + } + + throw new InvalidOperationException("CMap is invalid"); + } + + private static byte ReadByte(IInputBytes bytes) + { + if (!bytes.MoveNext()) + { + throw new InvalidOperationException("Read byte called on input bytes which was at end of byte set. Current offset: " + bytes.CurrentOffset); + } + + return bytes.CurrentByte; + } + + private static int ByteArrayToInt(byte[] data, int dataLen) + { + int code = 0; + for (int i = 0; i < dataLen; ++i) + { + code <<= 8; + code |= (data[i] & 0xFF); + } + return code; + } + + } } diff --git a/src/UglyToad.Pdf/Fonts/IFont.cs b/src/UglyToad.Pdf/Fonts/IFont.cs index 960f8e1f..212547f3 100644 --- a/src/UglyToad.Pdf/Fonts/IFont.cs +++ b/src/UglyToad.Pdf/Fonts/IFont.cs @@ -1,5 +1,7 @@ namespace UglyToad.Pdf.Fonts { + using System; + using System.Collections.Generic; using Cmap; using Cos; using Geometry; @@ -24,28 +26,53 @@ internal class CompositeFont : IFont { - public CosName Name { get; } + private readonly Dictionary codeToWidthMap = new Dictionary(); - public CosName SubType { get; } + public CosName Name { get; set; } + + public CosName SubType { get; set; } public string BaseFontType { get; } + public bool IsVertical { get; } - public CMap ToUnicode { get; } + public CMap ToUnicode { get; set; } public int ReadCharacterCode(IInputBytes bytes, out int codeLength) { - throw new System.NotImplementedException(); + var current = bytes.CurrentOffset; + + var code = ToUnicode.ReadCode(bytes); + + codeLength = bytes.CurrentOffset - current; + + return code; } public string GetUnicode(int characterCode) { - throw new System.NotImplementedException(); + if (ToUnicode != null) + { + if (ToUnicode.TryConvertToUnicode(characterCode, out string s)) return s; + } + + throw new NotImplementedException($"Could not locate the unicode for the character code {characterCode} in font {Name}."); } public PdfVector GetDisplacement(int characterCode) { - throw new System.NotImplementedException(); + var width = GetCharacterWidth(characterCode); + return new PdfVector(width / 1000, 0); + } + + private decimal GetCharacterWidth(int characterCode) + { + if (codeToWidthMap.TryGetValue(characterCode, out var width)) + { + return width; + } + + return 12000; } } } diff --git a/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs b/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs index 5cd9fbfb..3c7b6332 100644 --- a/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs +++ b/src/UglyToad.Pdf/Graphics/ContentStreamProcessor.cs @@ -2,6 +2,7 @@ { using System; using System.Collections.Generic; + using System.Diagnostics; using Content; using Fonts; using Geometry; @@ -19,9 +20,12 @@ public int StackSize => graphicsStack.Count; + public List Texts = new List(); + public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore) { this.resourceStore = resourceStore; + graphicsStack.Push(new CurrentGraphicsState()); } public PageContent Process(IReadOnlyList operations) @@ -30,7 +34,11 @@ ProcessOperations(operations); - return new PageContent(); + return new PageContent + { + GraphicsStateOperations = operations, + Text = Texts + }; } private void ProcessOperations(IReadOnlyList operations) @@ -49,6 +57,7 @@ return saved; } + [DebuggerStepThrough] public CurrentGraphicsState GetCurrentState() { return graphicsStack.Peek(); @@ -116,7 +125,7 @@ private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font, int characterCode, string unicode, PdfVector displacement) { - throw new NotImplementedException(); + Texts.Add(unicode); } } } \ No newline at end of file diff --git a/src/UglyToad.Pdf/Graphics/CurrentGraphicsState.cs b/src/UglyToad.Pdf/Graphics/CurrentGraphicsState.cs index a68be7ae..88cc222f 100644 --- a/src/UglyToad.Pdf/Graphics/CurrentGraphicsState.cs +++ b/src/UglyToad.Pdf/Graphics/CurrentGraphicsState.cs @@ -15,7 +15,7 @@ ///

/// The for this graphics state. /// - public CurrentFontState FontState { get; set; } + public CurrentFontState FontState { get; set; } = new CurrentFontState(); /// /// Thickness in user space units of path to be stroked. diff --git a/src/UglyToad.Pdf/Graphics/ReflectionGraphicsStateOperationFactory.cs b/src/UglyToad.Pdf/Graphics/ReflectionGraphicsStateOperationFactory.cs index c5b55d3f..6778cfcd 100644 --- a/src/UglyToad.Pdf/Graphics/ReflectionGraphicsStateOperationFactory.cs +++ b/src/UglyToad.Pdf/Graphics/ReflectionGraphicsStateOperationFactory.cs @@ -2,9 +2,11 @@ namespace UglyToad.Pdf.Graphics { using System; using System.Collections.Generic; + using System.Linq; using System.Reflection; using Cos; using Operations; + using Operations.TextShowing; using Tokenization.Tokens; internal class ReflectionGraphicsStateOperationFactory : IGraphicsStateOperationFactory @@ -39,6 +41,28 @@ namespace UglyToad.Pdf.Graphics public IGraphicsStateOperation Create(OperatorToken op, IReadOnlyList operands) { + switch (op.Data) + { + case ShowText.Symbol: + if (operands.Count != 1) + { + throw new InvalidOperationException($"Attempted to create a show text operation with {operands.Count} operands."); + } + + if (operands[0] is StringToken s) + { + return new ShowText(s.Data); + } + else if (operands[0] is HexToken h) + { + return new ShowText(h.Bytes.ToArray()); + } + else + { + throw new InvalidOperationException($"Tried to create a show text operation with operand type: {operands[0]?.GetType().Name ?? "null"}"); + } + } + if (!operations.TryGetValue(op.Data, out Type operationType)) { return null; diff --git a/src/UglyToad.Pdf/Util/Bootstrapper.cs b/src/UglyToad.Pdf/Util/Bootstrapper.cs index 686fcb1a..54328572 100644 --- a/src/UglyToad.Pdf/Util/Bootstrapper.cs +++ b/src/UglyToad.Pdf/Util/Bootstrapper.cs @@ -1,6 +1,7 @@ namespace UglyToad.Pdf.Util { using Filters; + using Fonts.Parser; using Graphics; using Logging; using Parser; @@ -51,6 +52,8 @@ var pageContentParser = new PageContentParser(); var operationFactory = new ReflectionGraphicsStateOperationFactory(); + var cmapParser = new CMapParser(); + var container = new Container(); container.Register(headerParser); container.Register(trailerParser); @@ -70,6 +73,7 @@ container.Register(fontParser); container.Register(pageContentParser); container.Register(operationFactory); + container.Register(cmapParser); return container; }