mirror of
https://github.com/UglyToad/PdfPig.git
synced 2026-03-10 00:23:29 +08:00
get the text out of the google docs document finally
This commit is contained in:
@@ -87,16 +87,15 @@
|
||||
}
|
||||
|
||||
var contents = contentStream.Decode(parsingArguments.Container.Get<IFilterProvider>());
|
||||
|
||||
if (Debugger.IsAttached)
|
||||
{
|
||||
var textContents = OtherEncodings.BytesAsLatin1String(contents);
|
||||
}
|
||||
|
||||
|
||||
var operations = parsingArguments.Container.Get<PageContentParser>()
|
||||
.Parse(parsingArguments.Container.Get<IGraphicsStateOperationFactory>(), new ByteArrayInputBytes(contents));
|
||||
|
||||
var context = new ContentStreamProcessor(MediaBox.Bounds, parsingArguments.CachingProviders.ResourceContainer);
|
||||
|
||||
var content = context.Process(operations);
|
||||
|
||||
Content = content;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,5 +13,7 @@
|
||||
public class PageContent
|
||||
{
|
||||
internal IReadOnlyList<IGraphicsStateOperation> GraphicsStateOperations { get; set; }
|
||||
|
||||
public IReadOnlyList<string> Text { get; set; }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,11 @@
|
||||
using System.Collections.Generic;
|
||||
using ContentStream;
|
||||
using Cos;
|
||||
using Filters;
|
||||
using Fonts;
|
||||
using Fonts.Cmap;
|
||||
using Fonts.Parser;
|
||||
using IO;
|
||||
using Parser;
|
||||
|
||||
internal interface IResourceStore
|
||||
@@ -47,7 +51,35 @@
|
||||
|
||||
var fontObject = dynamicParser.Parse(arguments, objectKey, false) as ContentStreamDictionary;
|
||||
|
||||
var font = new CompositeFont();
|
||||
if (fontObject == null)
|
||||
{
|
||||
throw new InvalidOperationException($"Could not retrieve the font with name: {pair.Key} which should have been object {objectKey.GetObjectNumber()}");
|
||||
}
|
||||
|
||||
CMap toUnicodeCMap = null;
|
||||
if (fontObject.ContainsKey(CosName.TO_UNICODE))
|
||||
{
|
||||
var toUnicodeValue = fontObject[CosName.TO_UNICODE];
|
||||
|
||||
var toUnicode = dynamicParser.Parse(arguments, toUnicodeValue as CosObject, false) as RawCosStream;
|
||||
|
||||
var decodedUnicodeCMap = toUnicode?.Decode(arguments.Container.Get<IFilterProvider>());
|
||||
|
||||
if (decodedUnicodeCMap != null)
|
||||
{
|
||||
toUnicodeCMap = arguments.Container.Get<CMapParser>()
|
||||
.Parse(new ByteArrayInputBytes(decodedUnicodeCMap), arguments.IsLenientParsing);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
var font = new CompositeFont
|
||||
{
|
||||
Name = pair.Key,
|
||||
SubType = fontObject.GetName(CosName.SUBTYPE),
|
||||
ToUnicode = toUnicodeCMap
|
||||
};
|
||||
|
||||
loadedFonts[pair.Key] = font;
|
||||
}
|
||||
@@ -55,7 +87,9 @@
|
||||
|
||||
/// <summary>
/// Looks up a previously loaded font by its resource name.
/// </summary>
/// <param name="name">The name the font was stored under in <c>loadedFonts</c>.</param>
/// <returns>
/// The font stored for <paramref name="name"/>. The result of <c>TryGetValue</c> is not checked,
/// so when no font is stored the default value of the out variable is returned
/// (presumably <c>null</c> - confirm against the declared type of <c>loadedFonts</c>).
/// </returns>
public IFont GetFont(CosName name)
{
    // The stale "throw new NotImplementedException()" that preceded this lookup made it unreachable.
    loadedFonts.TryGetValue(name, out var font);

    return font;
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using IO;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
public class CMap
|
||||
@@ -32,6 +34,9 @@
|
||||
|
||||
public bool HasUnicodeMappings => BaseFontCharacterMap.Count > 0;
|
||||
|
||||
private readonly int minCodeLength = 4;
|
||||
private readonly int maxCodeLength;
|
||||
|
||||
public CMap(CharacterIdentifierSystemInfo info, int type, int wMode, string name, string version, IReadOnlyDictionary<int, string> baseFontCharacterMap, IReadOnlyList<CodespaceRange> codespaceRanges, IReadOnlyList<CidRange> cidRanges, IReadOnlyList<CidCharacterMapping> cidCharacterMappings)
|
||||
{
|
||||
Info = info;
|
||||
@@ -43,6 +48,8 @@
|
||||
CodespaceRanges = codespaceRanges ?? throw new ArgumentNullException(nameof(codespaceRanges));
|
||||
CidRanges = cidRanges ?? throw new ArgumentNullException(nameof(cidRanges));
|
||||
CidCharacterMappings = cidCharacterMappings ?? throw new ArgumentNullException(nameof(cidCharacterMappings));
|
||||
maxCodeLength = CodespaceRanges.Max(x => x.CodeLength);
|
||||
minCodeLength = CodespaceRanges.Min(x => x.CodeLength);
|
||||
}
|
||||
|
||||
private int wmode = 0;
|
||||
@@ -54,8 +61,6 @@
|
||||
private string ordering = null;
|
||||
private int supplement = 0;
|
||||
|
||||
private int minCodeLength = 4;
|
||||
private int maxCodeLength;
|
||||
|
||||
// CID mappings
|
||||
private readonly Dictionary<int, int> codeToCid = new Dictionary<int, int>();
|
||||
@@ -77,38 +82,6 @@
|
||||
return found;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a character code from a string in the content stream.
|
||||
* <p>See "CMap Mapping" and "Handling Undefined Characters" in PDF32000 for more details.
|
||||
*
|
||||
* @param in string stream
|
||||
* @return character code
|
||||
* @throws IOException if there was an error reading the stream or CMap
|
||||
*/
|
||||
//public int readCode(InputStream input)
|
||||
//{
|
||||
// byte[] bytes = new byte[maxCodeLength];
|
||||
// input.read(bytes, 0, minCodeLength);
|
||||
// for (int i = minCodeLength - 1; i < maxCodeLength; i++)
|
||||
// {
|
||||
// var byteCount = i + 1;
|
||||
// foreach (var range in codespaceRanges)
|
||||
// {
|
||||
// if (range.isFullMatch(bytes, byteCount))
|
||||
// {
|
||||
// return toInt(bytes, byteCount);
|
||||
// }
|
||||
// }
|
||||
// if (byteCount < maxCodeLength)
|
||||
// {
|
||||
// bytes[byteCount] = (byte)input.read();
|
||||
// }
|
||||
// }
|
||||
|
||||
// throw new InvalidOperationException("CMap is invalid");
|
||||
//}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the CID for the given character code.
|
||||
*
|
||||
@@ -139,6 +112,57 @@
|
||||
{
|
||||
return cmapName;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Reads a single character code from the input, starting at the byte the input is currently positioned on.
/// </summary>
/// <remarks>
/// The first <c>minCodeLength</c> bytes are read up front. The candidate is then tested against every
/// codespace range and grown by one byte at a time, up to <c>maxCodeLength</c> bytes, until a range
/// reports a full match.
/// </remarks>
/// <param name="bytes">The input to read from; it is advanced for every byte after the first.</param>
/// <returns>The matched bytes combined into an integer by <c>ByteArrayToInt</c>.</returns>
/// <exception cref="InvalidOperationException">
/// No codespace range matched any candidate length, or the input ended while a byte was still required.
/// </exception>
public int ReadCode(IInputBytes bytes)
{
    var buffer = new byte[maxCodeLength];

    // The first byte is taken from the current position without advancing the input.
    buffer[0] = bytes.CurrentByte;

    for (var index = 1; index < minCodeLength; index++)
    {
        buffer[index] = ReadByte(bytes);
    }

    for (var length = minCodeLength; length <= maxCodeLength; length++)
    {
        if (CodespaceRanges.Any(range => range.isFullMatch(buffer, length)))
        {
            return ByteArrayToInt(buffer, length);
        }

        // Only pull another byte when a longer candidate is still possible.
        if (length < maxCodeLength)
        {
            buffer[length] = ReadByte(bytes);
        }
    }

    throw new InvalidOperationException("CMap is invalid");
}
|
||||
|
||||
/// <summary>
/// Advances the input by one position and returns the byte found there.
/// </summary>
/// <param name="bytes">The input to advance.</param>
/// <returns>The byte at the new current position.</returns>
/// <exception cref="InvalidOperationException">The input could not be advanced.</exception>
private static byte ReadByte(IInputBytes bytes)
{
    if (bytes.MoveNext())
    {
        return bytes.CurrentByte;
    }

    throw new InvalidOperationException("Read byte called on input bytes which was at end of byte set. Current offset: " + bytes.CurrentOffset);
}
|
||||
|
||||
/// <summary>
/// Combines the first <paramref name="dataLen"/> bytes of <paramref name="data"/> into an integer,
/// treating the first byte as the most significant.
/// </summary>
/// <param name="data">The bytes to combine.</param>
/// <param name="dataLen">How many bytes from the start of <paramref name="data"/> to use.</param>
/// <returns>The combined value; 0 when <paramref name="dataLen"/> is not positive.</returns>
private static int ByteArrayToInt(byte[] data, int dataLen)
{
    var value = 0;
    var position = 0;

    while (position < dataLen)
    {
        // A C# byte is unsigned, so it can be OR-ed in directly after shifting.
        value = (value << 8) | data[position];
        position++;
    }

    return value;
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
namespace UglyToad.Pdf.Fonts
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Cmap;
|
||||
using Cos;
|
||||
using Geometry;
|
||||
@@ -24,28 +26,53 @@
|
||||
|
||||
/// <summary>
/// A font which reads character codes and maps them to Unicode through its <see cref="ToUnicode"/> CMap.
/// </summary>
internal class CompositeFont : IFont
{
    // Widths keyed by character code. Nothing visible in this class adds entries to the map.
    private readonly Dictionary<int, decimal> codeToWidthMap = new Dictionary<int, decimal>();

    /// <summary>The name of this font.</summary>
    public CosName Name { get; set; }

    /// <summary>The subtype name of this font.</summary>
    public CosName SubType { get; set; }

    public string BaseFontType { get; }

    public bool IsVertical { get; }

    /// <summary>The CMap used to read codes and convert them to Unicode; may be <c>null</c>.</summary>
    public CMap ToUnicode { get; set; }

    /// <summary>
    /// Reads the next character code from the input using the <see cref="ToUnicode"/> CMap.
    /// </summary>
    /// <param name="bytes">The input positioned on the first byte of the code.</param>
    /// <param name="codeLength">The change in the input's current offset caused by the read.</param>
    /// <returns>The character code read.</returns>
    /// <exception cref="InvalidOperationException">No <see cref="ToUnicode"/> CMap is set.</exception>
    public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
    {
        if (ToUnicode == null)
        {
            // Fail with a descriptive error rather than a NullReferenceException.
            throw new InvalidOperationException($"Cannot read a character code for font {Name} because it has no ToUnicode CMap.");
        }

        var startOffset = bytes.CurrentOffset;

        var code = ToUnicode.ReadCode(bytes);

        codeLength = bytes.CurrentOffset - startOffset;

        return code;
    }

    /// <summary>
    /// Converts a character code to its Unicode string using the <see cref="ToUnicode"/> CMap.
    /// </summary>
    /// <param name="characterCode">The character code to convert.</param>
    /// <returns>The Unicode string for the code.</returns>
    /// <exception cref="NotImplementedException">
    /// There is no <see cref="ToUnicode"/> CMap or it has no mapping for the code.
    /// </exception>
    public string GetUnicode(int characterCode)
    {
        if (ToUnicode != null && ToUnicode.TryConvertToUnicode(characterCode, out string unicode))
        {
            return unicode;
        }

        throw new NotImplementedException($"Could not locate the unicode for the character code {characterCode} in font {Name}.");
    }

    /// <summary>
    /// Gets the displacement for a character code: its width divided by 1000 horizontally, 0 vertically.
    /// </summary>
    /// <param name="characterCode">The character code to look up.</param>
    public PdfVector GetDisplacement(int characterCode)
    {
        var width = GetCharacterWidth(characterCode);

        return new PdfVector(width / 1000, 0);
    }

    /// <summary>
    /// Returns the stored width for a character code, or 12000 when none is stored.
    /// </summary>
    private decimal GetCharacterWidth(int characterCode)
    {
        if (codeToWidthMap.TryGetValue(characterCode, out var width))
        {
            return width;
        }

        // NOTE(review): 12000 looks like a placeholder default width - confirm the intended value.
        return 12000;
    }
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using Content;
|
||||
using Fonts;
|
||||
using Geometry;
|
||||
@@ -19,9 +20,12 @@
|
||||
|
||||
public int StackSize => graphicsStack.Count;
|
||||
|
||||
public List<string> Texts = new List<string>();
|
||||
|
||||
public ContentStreamProcessor(PdfRectangle cropBox, IResourceStore resourceStore)
|
||||
{
|
||||
this.resourceStore = resourceStore;
|
||||
graphicsStack.Push(new CurrentGraphicsState());
|
||||
}
|
||||
|
||||
public PageContent Process(IReadOnlyList<IGraphicsStateOperation> operations)
|
||||
@@ -30,7 +34,11 @@
|
||||
|
||||
ProcessOperations(operations);
|
||||
|
||||
return new PageContent();
|
||||
return new PageContent
|
||||
{
|
||||
GraphicsStateOperations = operations,
|
||||
Text = Texts
|
||||
};
|
||||
}
|
||||
|
||||
private void ProcessOperations(IReadOnlyList<IGraphicsStateOperation> operations)
|
||||
@@ -49,6 +57,7 @@
|
||||
return saved;
|
||||
}
|
||||
|
||||
[DebuggerStepThrough]
|
||||
public CurrentGraphicsState GetCurrentState()
|
||||
{
|
||||
return graphicsStack.Peek();
|
||||
@@ -116,7 +125,7 @@
|
||||
/// <summary>
/// Records the Unicode value of a shown glyph by appending it to <c>Texts</c>.
/// </summary>
/// <param name="renderingMatrix">Not used by the current implementation.</param>
/// <param name="font">Not used by the current implementation.</param>
/// <param name="characterCode">Not used by the current implementation.</param>
/// <param name="unicode">The text to record.</param>
/// <param name="displacement">Not used by the current implementation.</param>
private void ShowGlyph(TransformationMatrix renderingMatrix, IFont font,
    int characterCode, string unicode, PdfVector displacement)
{
    // The stale "throw new NotImplementedException()" that preceded this line made it unreachable.
    Texts.Add(unicode);
}
|
||||
}
|
||||
}
|
||||
@@ -15,7 +15,7 @@
|
||||
/// <summary>
|
||||
/// The <see cref="CurrentFontState"/> for this graphics state.
|
||||
/// </summary>
|
||||
public CurrentFontState FontState { get; set; }
|
||||
public CurrentFontState FontState { get; set; } = new CurrentFontState();
|
||||
|
||||
/// <summary>
|
||||
/// Thickness in user space units of path to be stroked.
|
||||
|
||||
@@ -2,9 +2,11 @@ namespace UglyToad.Pdf.Graphics
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
using Cos;
|
||||
using Operations;
|
||||
using Operations.TextShowing;
|
||||
using Tokenization.Tokens;
|
||||
|
||||
internal class ReflectionGraphicsStateOperationFactory : IGraphicsStateOperationFactory
|
||||
@@ -39,6 +41,28 @@ namespace UglyToad.Pdf.Graphics
|
||||
|
||||
public IGraphicsStateOperation Create(OperatorToken op, IReadOnlyList<IToken> operands)
|
||||
{
|
||||
switch (op.Data)
|
||||
{
|
||||
case ShowText.Symbol:
|
||||
if (operands.Count != 1)
|
||||
{
|
||||
throw new InvalidOperationException($"Attempted to create a show text operation with {operands.Count} operands.");
|
||||
}
|
||||
|
||||
if (operands[0] is StringToken s)
|
||||
{
|
||||
return new ShowText(s.Data);
|
||||
}
|
||||
else if (operands[0] is HexToken h)
|
||||
{
|
||||
return new ShowText(h.Bytes.ToArray());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException($"Tried to create a show text operation with operand type: {operands[0]?.GetType().Name ?? "null"}");
|
||||
}
|
||||
}
|
||||
|
||||
if (!operations.TryGetValue(op.Data, out Type operationType))
|
||||
{
|
||||
return null;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
namespace UglyToad.Pdf.Util
|
||||
{
|
||||
using Filters;
|
||||
using Fonts.Parser;
|
||||
using Graphics;
|
||||
using Logging;
|
||||
using Parser;
|
||||
@@ -51,6 +52,8 @@
|
||||
var pageContentParser = new PageContentParser();
|
||||
var operationFactory = new ReflectionGraphicsStateOperationFactory();
|
||||
|
||||
var cmapParser = new CMapParser();
|
||||
|
||||
var container = new Container();
|
||||
container.Register(headerParser);
|
||||
container.Register(trailerParser);
|
||||
@@ -70,6 +73,7 @@
|
||||
container.Register(fontParser);
|
||||
container.Register(pageContentParser);
|
||||
container.Register(operationFactory);
|
||||
container.Register(cmapParser);
|
||||
|
||||
return container;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user