Get the text out of the Google Docs document finally

This commit is contained in:
Eliot Jones
2017-12-02 19:08:38 +00:00
parent 257439b8a3
commit 3d082ef236
9 changed files with 175 additions and 52 deletions

View File

@@ -87,16 +87,15 @@
}
var contents = contentStream.Decode(parsingArguments.Container.Get<IFilterProvider>());
if (Debugger.IsAttached)
{
var textContents = OtherEncodings.BytesAsLatin1String(contents);
}
var operations = parsingArguments.Container.Get<PageContentParser>()
.Parse(parsingArguments.Container.Get<IGraphicsStateOperationFactory>(), new ByteArrayInputBytes(contents));
var context = new ContentStreamProcessor(MediaBox.Bounds, parsingArguments.CachingProviders.ResourceContainer);
var content = context.Process(operations);
Content = content;
}
}
}

View File

@@ -13,5 +13,7 @@
/// <summary>
/// The result of processing a page's content stream: the parsed operations and the text collected from them.
/// </summary>
public class PageContent
{
/// <summary>
/// The graphics state operations the content was built from. Internal, so only visible inside this assembly.
/// </summary>
internal IReadOnlyList<IGraphicsStateOperation> GraphicsStateOperations { get; set; }
/// <summary>
/// The pieces of text collected for the page, one string per entry.
/// </summary>
public IReadOnlyList<string> Text { get; set; }
}
}

View File

@@ -4,7 +4,11 @@
using System.Collections.Generic;
using ContentStream;
using Cos;
using Filters;
using Fonts;
using Fonts.Cmap;
using Fonts.Parser;
using IO;
using Parser;
internal interface IResourceStore
@@ -47,7 +51,35 @@
var fontObject = dynamicParser.Parse(arguments, objectKey, false) as ContentStreamDictionary;
var font = new CompositeFont();
if (fontObject == null)
{
throw new InvalidOperationException($"Could not retrieve the font with name: {pair.Key} which should have been object {objectKey.GetObjectNumber()}");
}
CMap toUnicodeCMap = null;
if (fontObject.ContainsKey(CosName.TO_UNICODE))
{
var toUnicodeValue = fontObject[CosName.TO_UNICODE];
var toUnicode = dynamicParser.Parse(arguments, toUnicodeValue as CosObject, false) as RawCosStream;
var decodedUnicodeCMap = toUnicode?.Decode(arguments.Container.Get<IFilterProvider>());
if (decodedUnicodeCMap != null)
{
toUnicodeCMap = arguments.Container.Get<CMapParser>()
.Parse(new ByteArrayInputBytes(decodedUnicodeCMap), arguments.IsLenientParsing);
}
}
var font = new CompositeFont
{
Name = pair.Key,
SubType = fontObject.GetName(CosName.SUBTYPE),
ToUnicode = toUnicodeCMap
};
loadedFonts[pair.Key] = font;
}
@@ -55,7 +87,9 @@
/// <summary>
/// Looks up a previously loaded font by the name it was stored under.
/// </summary>
/// <param name="name">The key to look up in <c>loadedFonts</c>.</param>
/// <returns>
/// The font stored under <paramref name="name"/>, or the default value (<c>null</c>) when
/// <c>loadedFonts</c> has no entry for it.
/// </returns>
public IFont GetFont(CosName name)
{
    // The placeholder "throw new NotImplementedException()" that used to open this body has
    // been removed: it ran unconditionally and made the lookup below unreachable.
    loadedFonts.TryGetValue(name, out var font);

    // A failed lookup is deliberately not an error here; the caller receives null.
    return font;
}
}
}

View File

@@ -2,6 +2,8 @@
{
using System;
using System.Collections.Generic;
using System.Linq;
using IO;
using Util.JetBrains.Annotations;
public class CMap
@@ -32,6 +34,9 @@
public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0;
private readonly int minCodeLength = 4;
private readonly int maxCodeLength;
public CMap(CharacterIdentifierSystemInfo info, int type, int wMode, string name, string version, IReadOnlyDictionary<int, string> baseFontCharacterMap, IReadOnlyList<CodespaceRange> codespaceRanges, IReadOnlyList<CidRange> cidRanges, IReadOnlyList<CidCharacterMapping> cidCharacterMappings)
{
Info = info;
@@ -43,6 +48,8 @@
CodespaceRanges = codespaceRanges ?? throw new ArgumentNullException(nameof(codespaceRanges));
CidRanges = cidRanges ?? throw new ArgumentNullException(nameof(cidRanges));
CidCharacterMappings = cidCharacterMappings ?? throw new ArgumentNullException(nameof(cidCharacterMappings));
maxCodeLength = CodespaceRanges.Max(x => x.CodeLength);
minCodeLength = CodespaceRanges.Min(x => x.CodeLength);
}
private int wmode = 0;
@@ -54,8 +61,6 @@
private string ordering = null;
private int supplement = 0;
private int minCodeLength = 4;
private int maxCodeLength;
// CID mappings
private readonly Dictionary<int, int> codeToCid = new Dictionary<int, int>();
@@ -77,38 +82,6 @@
return found;
}
/**
* Reads a character code from a string in the content stream.
* <p>See "CMap Mapping" and "Handling Undefined Characters" in PDF32000 for more details.
*
* @param in string stream
* @return character code
* @throws IOException if there was an error reading the stream or CMap
*/
//public int readCode(InputStream input)
//{
// byte[] bytes = new byte[maxCodeLength];
// input.read(bytes, 0, minCodeLength);
// for (int i = minCodeLength - 1; i < maxCodeLength; i++)
// {
// var byteCount = i + 1;
// foreach (var range in codespaceRanges)
// {
// if (range.isFullMatch(bytes, byteCount))
// {
// return toInt(bytes, byteCount);
// }
// }
// if (byteCount < maxCodeLength)
// {
// bytes[byteCount] = (byte)input.read();
// }
// }
// throw new InvalidOperationException("CMap is invalid");
//}
/**
* Returns the CID for the given character code.
*
@@ -139,6 +112,57 @@
{
return cmapName;
}
}
/// <summary>
/// Reads one character code from the input, growing the candidate byte sequence until it
/// fully matches one of the <c>CodespaceRanges</c>.
/// </summary>
/// <param name="bytes">The input; its current byte is used as the first byte of the code.</param>
/// <returns>The matched bytes folded into a single integer, first byte most significant.</returns>
/// <exception cref="InvalidOperationException">
/// No codespace range matched within <c>maxCodeLength</c> bytes, or the input ended while more bytes were needed.
/// </exception>
public int ReadCode(IInputBytes bytes)
{
    var buffer = new byte[maxCodeLength];

    // The first byte comes from the current position without advancing the input.
    buffer[0] = bytes.CurrentByte;

    // Top the buffer up to the shortest code length any range allows.
    var filled = 1;
    while (filled < minCodeLength)
    {
        buffer[filled] = ReadByte(bytes);
        filled++;
    }

    // Try each candidate length in turn, pulling one more byte whenever nothing matched.
    for (var length = minCodeLength; length <= maxCodeLength; length++)
    {
        foreach (CodespaceRange candidate in CodespaceRanges)
        {
            if (!candidate.isFullMatch(buffer, length))
            {
                continue;
            }

            return ByteArrayToInt(buffer, length);
        }

        if (length < maxCodeLength)
        {
            buffer[length] = ReadByte(bytes);
        }
    }

    throw new InvalidOperationException("CMap is invalid");
}
/// <summary>
/// Advances the input by one position and returns the byte found there.
/// </summary>
/// <param name="bytes">The input to advance.</param>
/// <returns>The byte at the new current position.</returns>
/// <exception cref="InvalidOperationException"><c>MoveNext</c> reported that no further byte was available.</exception>
private static byte ReadByte(IInputBytes bytes)
{
    if (bytes.MoveNext())
    {
        return bytes.CurrentByte;
    }

    throw new InvalidOperationException("Read byte called on input bytes which was at end of byte set. Current offset: " + bytes.CurrentOffset);
}
/// <summary>
/// Folds the first <paramref name="dataLen"/> bytes of <paramref name="data"/> into an integer,
/// treating the first byte as the most significant.
/// </summary>
/// <param name="data">The bytes to combine.</param>
/// <param name="dataLen">How many leading bytes of <paramref name="data"/> to use.</param>
/// <returns>The combined value; 0 when <paramref name="dataLen"/> is not positive.</returns>
private static int ByteArrayToInt(byte[] data, int dataLen)
{
    var value = 0;
    var index = 0;

    while (index < dataLen)
    {
        // A C# byte is already in the range 0..255, so no mask is needed before merging it in.
        value = (value << 8) | data[index];
        index++;
    }

    return value;
}
}
}

View File

@@ -1,5 +1,7 @@
namespace UglyToad.Pdf.Fonts
{
using System;
using System.Collections.Generic;
using Cmap;
using Cos;
using Geometry;
@@ -24,28 +26,53 @@
/// <summary>
/// An <see cref="IFont"/> whose character codes are read and converted to text through its
/// <see cref="ToUnicode"/> CMap.
/// </summary>
/// <remarks>
/// Each of <see cref="Name"/>, <see cref="SubType"/> and <see cref="ToUnicode"/> is declared exactly once
/// (with a setter, so the font can be built with an object initializer), and the placeholder
/// <c>throw new System.NotImplementedException()</c> statements that made every method body
/// unreachable have been removed.
/// </remarks>
internal class CompositeFont : IFont
{
    // Widths keyed by character code. Nothing in this class adds entries, so until something
    // does, every lookup falls through to the default width in GetCharacterWidth.
    private readonly Dictionary<int, decimal> codeToWidthMap = new Dictionary<int, decimal>();

    /// <summary>The name of the font.</summary>
    public CosName Name { get; set; }

    /// <summary>The font's subtype name.</summary>
    public CosName SubType { get; set; }

    /// <summary>The base font type. Never assigned in this class, so it keeps its default value.</summary>
    public string BaseFontType { get; }

    /// <summary>Whether the font is vertical. Never assigned in this class, so it keeps its default value.</summary>
    public bool IsVertical { get; }

    /// <summary>The CMap used to read character codes and convert them to unicode; may be <c>null</c>.</summary>
    public CMap ToUnicode { get; set; }

    /// <summary>
    /// Reads the next character code from the input using <see cref="ToUnicode"/>.
    /// </summary>
    /// <param name="bytes">The input to read from.</param>
    /// <param name="codeLength">
    /// The difference in <c>CurrentOffset</c> of <paramref name="bytes"/> before and after the read.
    /// </param>
    /// <returns>The character code returned by <c>ToUnicode.ReadCode</c>.</returns>
    /// <exception cref="InvalidOperationException"><see cref="ToUnicode"/> is <c>null</c>.</exception>
    public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
    {
        // GetUnicode already tolerates a missing CMap; fail here with a clear message instead
        // of a NullReferenceException.
        if (ToUnicode == null)
        {
            throw new InvalidOperationException($"Cannot read a character code for font {Name} because it has no ToUnicode CMap.");
        }

        var start = bytes.CurrentOffset;

        var code = ToUnicode.ReadCode(bytes);

        codeLength = bytes.CurrentOffset - start;

        return code;
    }

    /// <summary>
    /// Converts a character code to its unicode string using <see cref="ToUnicode"/>.
    /// </summary>
    /// <param name="characterCode">The code to convert.</param>
    /// <returns>The string produced by <c>ToUnicode.TryConvertToUnicode</c>.</returns>
    /// <exception cref="NotImplementedException">
    /// <see cref="ToUnicode"/> is <c>null</c> or has no mapping for <paramref name="characterCode"/>.
    /// </exception>
    public string GetUnicode(int characterCode)
    {
        if (ToUnicode != null && ToUnicode.TryConvertToUnicode(characterCode, out string s))
        {
            return s;
        }

        throw new NotImplementedException($"Could not locate the unicode for the character code {characterCode} in font {Name}.");
    }

    /// <summary>
    /// Gets the displacement for a character code: its width divided by 1000 horizontally, and 0 vertically.
    /// </summary>
    /// <param name="characterCode">The code whose displacement is wanted.</param>
    /// <returns>A <see cref="PdfVector"/> of (width / 1000, 0).</returns>
    public PdfVector GetDisplacement(int characterCode)
    {
        var width = GetCharacterWidth(characterCode);

        return new PdfVector(width / 1000, 0);
    }

    /// <summary>
    /// Looks up the width recorded for a character code.
    /// </summary>
    /// <param name="characterCode">The code to look up.</param>
    /// <returns>The recorded width, or 12000 when none is recorded.</returns>
    private decimal GetCharacterWidth(int characterCode)
    {
        if (codeToWidthMap.TryGetValue(characterCode, out var width))
        {
            return width;
        }

        // Fallback width; GetDisplacement turns this into a horizontal displacement of 12.
        return 12000;
    }
}
}

View File

@@ -2,6 +2,7 @@
{
using System;
using System.Collections.Generic;
using System.Diagnostics;
using Content;
using Fonts;
using Geometry;
@@ -19,9 +20,12 @@
public int StackSize => graphicsStack.Count;
public List<string> Texts = new List<string>();
/// <summary>
/// Creates a processor with one initial graphics state on its stack.
/// </summary>
/// <param name="cropBox">Not used by this constructor.</param>
/// <param name="resourceStore">The resource store kept for later use by this processor.</param>
public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore)
{
    // Seed the stack so there is always a state to peek at.
    graphicsStack.Push(new CurrentGraphicsState());

    this.resourceStore = resourceStore;
}
public PageContent Process(IReadOnlyList<IGraphicsStateOperation> operations)
@@ -30,7 +34,11 @@
ProcessOperations(operations);
return new PageContent();
return new PageContent
{
GraphicsStateOperations = operations,
Text = Texts
};
}
private void ProcessOperations(IReadOnlyList<IGraphicsStateOperation> operations)
@@ -49,6 +57,7 @@
return saved;
}
[DebuggerStepThrough]
public CurrentGraphicsState GetCurrentState()
{
return graphicsStack.Peek();
@@ -116,7 +125,7 @@
/// <summary>
/// Records the text of a single shown glyph by appending it to <c>Texts</c>.
/// </summary>
/// <param name="renderingMatrix">Not used yet.</param>
/// <param name="font">Not used yet.</param>
/// <param name="characterCode">Not used yet.</param>
/// <param name="unicode">The glyph's text; this is the only value recorded.</param>
/// <param name="displacement">Not used yet.</param>
private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font,
    int characterCode, string unicode, PdfVector displacement)
{
    // The placeholder "throw new NotImplementedException()" that used to open this body has
    // been removed: it ran unconditionally, so no text was ever collected.
    Texts.Add(unicode);
}
}
}

View File

@@ -15,7 +15,7 @@
/// <summary>
/// The <see cref="CurrentFontState"/> for this graphics state.
/// Initialised to a new <see cref="CurrentFontState"/>, so it is not <c>null</c> unless explicitly set to <c>null</c>.
/// </summary>
public CurrentFontState FontState { get; set; } = new CurrentFontState();
/// <summary>
/// Thickness in user space units of path to be stroked.

View File

@@ -2,9 +2,11 @@ namespace UglyToad.Pdf.Graphics
{
using System;
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
using Cos;
using Operations;
using Operations.TextShowing;
using Tokenization.Tokens;
internal class ReflectionGraphicsStateOperationFactory : IGraphicsStateOperationFactory
@@ -39,6 +41,28 @@ namespace UglyToad.Pdf.Graphics
public IGraphicsStateOperation Create(OperatorToken op, IReadOnlyList<IToken> operands)
{
switch (op.Data)
{
case ShowText.Symbol:
if (operands.Count != 1)
{
throw new InvalidOperationException($"Attempted to create a show text operation with {operands.Count} operands.");
}
if (operands[0] is StringToken s)
{
return new ShowText(s.Data);
}
else if (operands[0] is HexToken h)
{
return new ShowText(h.Bytes.ToArray());
}
else
{
throw new InvalidOperationException($"Tried to create a show text operation with operand type: {operands[0]?.GetType().Name ?? "null"}");
}
}
if (!operations.TryGetValue(op.Data, out Type operationType))
{
return null;

View File

@@ -1,6 +1,7 @@
namespace UglyToad.Pdf.Util
{
using Filters;
using Fonts.Parser;
using Graphics;
using Logging;
using Parser;
@@ -51,6 +52,8 @@
var pageContentParser = new PageContentParser();
var operationFactory = new ReflectionGraphicsStateOperationFactory();
var cmapParser = new CMapParser();
var container = new Container();
container.Register(headerParser);
container.Register(trailerParser);
@@ -70,6 +73,7 @@
container.Register(fontParser);
container.Register(pageContentParser);
container.Register(operationFactory);
container.Register(cmapParser);
return container;
}