mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-10-14 02:44:58 +08:00
Add basic marked content extraction capabilities
This commit is contained in:
@@ -91,6 +91,16 @@
|
||||
{
|
||||
}
|
||||
|
||||
public void BeginMarkedContent(NameToken name, NameToken propertyDictionaryName, DictionaryToken properties)
{
    // Deliberately left empty: this context does nothing when a marked-content sequence begins.
}
|
||||
|
||||
public void EndMarkedContent()
{
    // Deliberately left empty: this context does nothing when a marked-content sequence ends.
}
|
||||
|
||||
private class TestFontFactory : IFontFactory
|
||||
{
|
||||
public IFont Get(DictionaryToken dictionary, bool isLenientParsing)
|
||||
|
@@ -153,6 +153,12 @@
|
||||
/// </summary>
|
||||
public IEnumerable<IPdfImage> GetImages()
{
    // Straight delegation to the page content.
    return Content.GetImages();
}
|
||||
|
||||
/// <summary>
/// Gets the marked contents held by this page's content.
/// </summary>
/// <returns>The list of <see cref="PdfMarkedContent"/> returned by the underlying page content.</returns>
public IReadOnlyList<PdfMarkedContent> GetMarkedContents()
{
    return Content.GetMarkedContents();
}
|
||||
|
||||
/// <summary>
|
||||
/// Provides access to useful members which will change in future releases.
|
||||
/// </summary>
|
||||
|
@@ -20,6 +20,7 @@
|
||||
internal class PageContent
|
||||
{
|
||||
private readonly IReadOnlyList<Union<XObjectContentRecord, InlineImage>> images;
|
||||
private readonly IReadOnlyList<PdfMarkedContent> markedContents;
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly IFilterProvider filterProvider;
|
||||
private readonly IResourceStore resourceStore;
|
||||
@@ -34,6 +35,7 @@
|
||||
internal PageContent(IReadOnlyList<IGraphicsStateOperation> graphicsStateOperations, IReadOnlyList<Letter> letters,
|
||||
IReadOnlyList<PdfPath> paths,
|
||||
IReadOnlyList<Union<XObjectContentRecord, InlineImage>> images,
|
||||
IReadOnlyList<PdfMarkedContent> markedContents,
|
||||
IPdfTokenScanner pdfScanner,
|
||||
IFilterProvider filterProvider,
|
||||
IResourceStore resourceStore,
|
||||
@@ -43,6 +45,7 @@
|
||||
Letters = letters;
|
||||
Paths = paths;
|
||||
this.images = images;
|
||||
this.markedContents = markedContents;
|
||||
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
||||
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
|
||||
this.resourceStore = resourceStore ?? throw new ArgumentNullException(nameof(resourceStore));
|
||||
@@ -61,5 +64,7 @@
|
||||
yield return result;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns the marked contents that were supplied to this instance's constructor.
/// </summary>
public IReadOnlyList<PdfMarkedContent> GetMarkedContents()
{
    return markedContents;
}
|
||||
}
|
||||
}
|
||||
|
159
src/UglyToad.PdfPig/Content/PdfArtifactMarkedContent.cs
Normal file
159
src/UglyToad.PdfPig/Content/PdfArtifactMarkedContent.cs
Normal file
@@ -0,0 +1,159 @@
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using System;
|
||||
using System.Linq;
|
||||
using UglyToad.PdfPig.Core;
|
||||
using UglyToad.PdfPig.Tokens;
|
||||
|
||||
/// <summary>
/// Artifacts are graphics objects that are not part of the author's original content but rather are
/// generated by the conforming writer in the course of pagination, layout, or other strictly mechanical
/// processes.
/// <para>Artifacts may also be used to describe areas of the document where the author uses a graphical
/// background, with the goal of enhancing the visual experience. In such a case, the background is not
/// required for understanding the content. - PDF 32000-1:2008, Section 14.8.2.2</para>
/// </summary>
public class PdfArtifactMarkedContent : PdfMarkedContent
{
    /// <summary>
    /// Creates marked content tagged with <see cref="NameToken.Artifact"/> and flags it as an artifact.
    /// </summary>
    /// <param name="id">Internal id, passed through to the base class.</param>
    /// <param name="properties">The property dictionary; may be <c>null</c>.</param>
    internal PdfArtifactMarkedContent(int id, DictionaryToken properties) : base(id, NameToken.Artifact, properties)
    {
        IsArtifact = true;
    }

    /// <summary>
    /// The artifact's type: Pagination, Layout, Page, or (PDF 1.7) Background.
    /// <see cref="ArtifactType.Unknown"/> is returned when there are no properties, no Type entry,
    /// or the entry does not name a member of <see cref="ArtifactType"/>.
    /// </summary>
    public ArtifactType Type
    {
        get
        {
            var typeName = GetStringProperty(NameToken.Type);
            if (typeName == null)
            {
                return ArtifactType.Unknown;
            }

            // Enum.TryParse also succeeds on numeric text (e.g. "7") and would then yield a value
            // that is not a member of the enum, so the result is validated before it is returned.
            if (Enum.TryParse(typeName, true, out ArtifactType result)
                && Enum.IsDefined(typeof(ArtifactType), result))
            {
                return result;
            }

            return ArtifactType.Unknown;
        }
    }

    /// <summary>
    /// The artifact's attribute owners (the O entry), or <c>null</c> if absent.
    /// </summary>
    public string AttributeOwners => GetStringProperty(NameToken.O);

    /// <summary>
    /// The artifact's bounding box (the BBox entry), or <c>null</c> if it is absent or is not an
    /// array starting with four numbers.
    /// </summary>
    public PdfRectangle? BoundingBox
    {
        get
        {
            if (Properties == null)
            {
                return null;
            }

            if (!Properties.TryGet(NameToken.Bbox, out ArrayToken arrayToken))
            {
                return null;
            }

            // A rectangle is stored as four numbers, so the coordinates live at indices 0..3.
            if (arrayToken.Data.Count < 4)
            {
                return null;
            }

            if (arrayToken[0] is NumericToken left
                && arrayToken[1] is NumericToken bottom
                && arrayToken[2] is NumericToken right
                && arrayToken[3] is NumericToken top)
            {
                return new PdfRectangle((double)left.Data, (double)bottom.Data, (double)right.Data, (double)top.Data);
            }

            // At least one entry was not a direct number: report "no bounding box" instead of throwing.
            return null;
        }
    }

    /// <summary>
    /// Is the artifact attached to the top edge?
    /// </summary>
    public bool IsTopAttached => IsAttached("Top");

    /// <summary>
    /// Is the artifact attached to the bottom edge?
    /// </summary>
    public bool IsBottomAttached => IsAttached("Bottom");

    /// <summary>
    /// Is the artifact attached to the left edge?
    /// </summary>
    public bool IsLeftAttached => IsAttached("Left");

    /// <summary>
    /// Is the artifact attached to the right edge?
    /// </summary>
    public bool IsRightAttached => IsAttached("Right");

    /// <summary>
    /// The artifact's subtype. Standard values are Header, Footer, and Watermark. Additional values may be
    /// specified for this entry, provided they comply with the naming conventions.
    /// <c>null</c> if absent.
    /// </summary>
    public string SubType => GetStringProperty(NameToken.Subtype);

    /// <summary>
    /// Checks whether the Attached array of the properties contains the name of the given edge.
    /// </summary>
    private bool IsAttached(string edge)
    {
        if (Properties == null)
        {
            return false;
        }

        if (Properties.TryGet(NameToken.Attached, out ArrayToken arrayToken))
        {
            return arrayToken.Data.Contains(NameToken.Create(edge));
        }

        return false;
    }

    /// <summary>
    /// Reads a string-valued entry from the properties; <c>null</c> when there are no properties
    /// or the entry is missing or is not a string-valued token.
    /// </summary>
    private string GetStringProperty(NameToken key)
    {
        if (Properties != null && Properties.TryGet(key, out IDataToken<string> token))
        {
            return token.Data;
        }

        return null;
    }
}
|
||||
|
||||
/// <summary>
/// The kind of an artifact. When specified, it is one of the names Pagination, Layout, Page,
/// or (PDF 1.7) Background.
/// </summary>
public enum ArtifactType
{
    /// <summary>
    /// The artifact type is not known.
    /// </summary>
    Unknown = 0,

    /// <summary>
    /// Ancillary page features, for example running heads and folios (page numbers).
    /// </summary>
    Pagination = 1,

    /// <summary>
    /// Purely cosmetic typographical or design elements, for example footnote rules or background screens.
    /// </summary>
    Layout = 2,

    /// <summary>
    /// Production aids that are extraneous to the document itself, for example cut marks and colour bars.
    /// </summary>
    Page = 3,

    /// <summary>
    /// (PDF 1.7) Images, patterns or coloured blocks that either run the entire length and/or
    /// width of the page or the entire dimensions of a structural element. Background artifacts
    /// typically serve as a background for content shown either on top of or placed adjacent to
    /// that background.
    /// <para>A background artifact can further be classified as visual content that serves to enhance
    /// the user experience, that lies under the actual content, and that is not required except to
    /// retain visual fidelity.</para>
    /// </summary>
    Background = 4
}
|
||||
}
|
190
src/UglyToad.PdfPig/Content/PdfMarkedContent.cs
Normal file
190
src/UglyToad.PdfPig/Content/PdfMarkedContent.cs
Normal file
@@ -0,0 +1,190 @@
|
||||
namespace UglyToad.PdfPig.Content
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using UglyToad.PdfPig.Core;
|
||||
using UglyToad.PdfPig.Graphics;
|
||||
using UglyToad.PdfPig.Tokens;
|
||||
using UglyToad.PdfPig.Util;
|
||||
|
||||
/// <summary>
/// A marked-content element: a tag with an optional property dictionary, together with the
/// letters, paths, images and nested marked contents that were added to it.
/// </summary>
public class PdfMarkedContent
{
    private readonly List<Letter> letters = new List<Letter>();
    private readonly List<PdfPath> pdfPaths = new List<PdfPath>();
    private readonly List<IPdfImage> images = new List<IPdfImage>();
    private readonly List<XObjectContentRecord> xObjectContentRecords = new List<XObjectContentRecord>();

    /// <summary>
    /// Creates a marked-content element with an empty list of children.
    /// </summary>
    /// <param name="id">The internal id of the element.</param>
    /// <param name="tag">The tag of the element.</param>
    /// <param name="properties">The property dictionary; may be <c>null</c>.</param>
    internal PdfMarkedContent(int id, NameToken tag, DictionaryToken properties)
    {
        Id = id;
        Tag = tag;
        Properties = properties;
        ChildContents = new List<PdfMarkedContent>();
    }

    /// <summary>
    /// Builds the marked content for a tag: a <see cref="PdfArtifactMarkedContent"/> when the tag
    /// equals <see cref="NameToken.Artifact"/>, otherwise a plain <see cref="PdfMarkedContent"/>.
    /// </summary>
    internal static PdfMarkedContent Create(int id, NameToken name, DictionaryToken properties)
    {
        return name.Equals(NameToken.Artifact)
            ? new PdfArtifactMarkedContent(id, properties)
            : new PdfMarkedContent(id, name, properties);
    }

    /// <summary>
    /// Whether the marked content is an artifact.
    /// </summary>
    public bool IsArtifact { get; internal set; }

    /// <summary>
    /// Internal Id for top marked content. Child marked contents will share the same Id as the parent.
    /// </summary>
    public int Id { get; }

    /// <summary>
    /// The tag of the marked content.
    /// </summary>
    public string Tag { get; }

    /// <summary>
    /// The property dictionary of the marked content; <c>null</c> when none was provided.
    /// </summary>
    public DictionaryToken Properties { get; }

    /// <summary>
    /// The marked contents nested directly inside this one.
    /// </summary>
    public List<PdfMarkedContent> ChildContents { get; }

    /// <summary>
    /// Marked-content identifier (the MCID entry), or -1 when there are no properties or no such entry.
    /// </summary>
    public int MCID =>
        Properties != null && Properties.ContainsKey(NameToken.Mcid)
            ? Properties.GetInt(NameToken.Mcid)
            : -1;

    /// <summary>
    /// The natural language specification (the Lang entry), or <c>null</c> if absent.
    /// </summary>
    public string Language => GetText(NameToken.Lang);

    /// <summary>
    /// The replacement text (the ActualText entry), or <c>null</c> if absent.
    /// </summary>
    public string ActualText => GetText(NameToken.ActualText);

    /// <summary>
    /// The alternate description, or <c>null</c> if absent.
    /// </summary>
    public string AlternateDescription => GetText(NameToken.Alternate);

    /// <summary>
    /// The abbreviation expansion text (the E entry), or <c>null</c> if absent.
    /// </summary>
    public string ExpandedForm => GetText(NameToken.E);

    /// <summary>
    /// The images added to this marked content.
    /// </summary>
    public IReadOnlyList<IPdfImage> Images => images;

    /// <summary>
    /// The paths added to this marked content.
    /// </summary>
    public IReadOnlyList<PdfPath> PdfPaths => pdfPaths;

    /// <summary>
    /// The letters added to this marked content.
    /// </summary>
    public IReadOnlyList<Letter> Letters => letters;

    /// <summary>Records an image as belonging to this marked content.</summary>
    internal void Add(IPdfImage pdfImage) => images.Add(pdfImage);

    /// <summary>Records a path as belonging to this marked content.</summary>
    internal void Add(PdfPath pdfPath) => pdfPaths.Add(pdfPath);

    /// <summary>Records a letter as belonging to this marked content.</summary>
    internal void Add(Letter letter) => letters.Add(letter);

    /// <summary>Records an XObject content record as belonging to this marked content.</summary>
    internal void Add(XObjectContentRecord xObjectContentRecord) => xObjectContentRecords.Add(xObjectContentRecord);

    /// <summary>Appends a nested marked content to <see cref="ChildContents"/>.</summary>
    internal void Add(PdfMarkedContent markedContent) => ChildContents.Add(markedContent);

    /// <inheritdoc />
    public override string ToString() =>
        "Id=" + Id + ", Tag=" + Tag + ", Properties=" + Properties + ", Contents=" + ChildContents.Count;

    /// <summary>
    /// Reads a string-valued entry from <see cref="Properties"/>; <c>null</c> when there are no
    /// properties or the entry cannot be obtained as a string-valued token.
    /// </summary>
    private string GetText(NameToken key)
    {
        if (Properties != null && Properties.TryGet(key, out IDataToken<string> token))
        {
            return token.Data;
        }

        return null;
    }
}
|
||||
}
|
@@ -24,6 +24,14 @@
|
||||
|
||||
internal class ContentStreamProcessor : IOperationContext
|
||||
{
|
||||
private readonly Stack<PdfMarkedContent> queuedMarkedContents = new Stack<PdfMarkedContent>();
|
||||
private int currentMarkedContentId;
|
||||
|
||||
/// <summary>
|
||||
/// Stores each marked content as it is encountered in the content stream.
|
||||
/// </summary>
|
||||
private readonly List<PdfMarkedContent> markedContents = new List<PdfMarkedContent>();
|
||||
|
||||
/// <summary>
|
||||
/// Stores each letter as it is encountered in the content stream.
|
||||
/// </summary>
|
||||
@@ -103,7 +111,7 @@
|
||||
|
||||
ProcessOperations(operations);
|
||||
|
||||
return new PageContent(operations, letters, paths, images, pdfScanner, filterProvider, resourceStore, isLenientParsing);
|
||||
return new PageContent(operations, letters, paths, images, markedContents, pdfScanner, filterProvider, resourceStore, isLenientParsing);
|
||||
}
|
||||
|
||||
private void ProcessOperations(IReadOnlyList<IGraphicsStateOperation> operations)
|
||||
@@ -227,6 +235,11 @@
|
||||
pointSize,
|
||||
textSequence);
|
||||
|
||||
if (queuedMarkedContents.Any())
|
||||
{
|
||||
queuedMarkedContents.Peek().Add(letter);
|
||||
}
|
||||
|
||||
letters.Add(letter);
|
||||
|
||||
double tx, ty;
|
||||
@@ -312,11 +325,21 @@
|
||||
|
||||
if (subType.Equals(NameToken.Ps))
|
||||
{
|
||||
xObjects[XObjectType.PostScript].Add(new XObjectContentRecord(XObjectType.PostScript, xObjectStream, matrix, state.RenderingIntent));
|
||||
var contentRecord = new XObjectContentRecord(XObjectType.PostScript, xObjectStream, matrix, state.RenderingIntent);
|
||||
if (queuedMarkedContents.Any())
|
||||
{
|
||||
queuedMarkedContents.Peek().Add(contentRecord);
|
||||
}
|
||||
xObjects[XObjectType.PostScript].Add(contentRecord);
|
||||
}
|
||||
else if (subType.Equals(NameToken.Image))
|
||||
{
|
||||
images.Add(Union<XObjectContentRecord, InlineImage>.One(new XObjectContentRecord(XObjectType.Image, xObjectStream, matrix, state.RenderingIntent)));
|
||||
var contentRecord = new XObjectContentRecord(XObjectType.Image, xObjectStream, matrix, state.RenderingIntent);
|
||||
if (queuedMarkedContents.Any())
|
||||
{
|
||||
queuedMarkedContents.Peek().Add(contentRecord);
|
||||
}
|
||||
images.Add(Union<XObjectContentRecord, InlineImage>.One(contentRecord));
|
||||
}
|
||||
else if (subType.Equals(NameToken.Form))
|
||||
{
|
||||
@@ -384,6 +407,10 @@
|
||||
{
|
||||
if (CurrentPath != null && CurrentPath.Commands.Count > 0 && !currentPathAdded)
|
||||
{
|
||||
if (queuedMarkedContents.Any())
|
||||
{
|
||||
queuedMarkedContents.Peek().Add(CurrentPath);
|
||||
}
|
||||
paths.Add(CurrentPath);
|
||||
}
|
||||
|
||||
@@ -399,6 +426,10 @@
|
||||
}
|
||||
else
|
||||
{
|
||||
if (queuedMarkedContents.Any())
|
||||
{
|
||||
queuedMarkedContents.Peek().Add(CurrentPath);
|
||||
}
|
||||
paths.Add(CurrentPath);
|
||||
currentPathAdded = true;
|
||||
}
|
||||
@@ -412,6 +443,10 @@
|
||||
}
|
||||
else
|
||||
{
|
||||
if (queuedMarkedContents.Any())
|
||||
{
|
||||
queuedMarkedContents.Peek().Add(CurrentPath);
|
||||
}
|
||||
paths.Add(CurrentPath);
|
||||
currentPathAdded = true;
|
||||
}
|
||||
@@ -420,6 +455,10 @@
|
||||
public void ClosePath()
|
||||
{
|
||||
CurrentPath.ClosePath();
|
||||
if (queuedMarkedContents.Any())
|
||||
{
|
||||
queuedMarkedContents.Peek().Add(CurrentPath);
|
||||
}
|
||||
paths.Add(CurrentPath);
|
||||
CurrentPath = null;
|
||||
currentPathAdded = false;
|
||||
@@ -496,11 +535,54 @@
|
||||
|
||||
var image = inlineImageBuilder.CreateInlineImage(CurrentTransformationMatrix, filterProvider, pdfScanner, GetCurrentState().RenderingIntent, resourceStore);
|
||||
|
||||
if (queuedMarkedContents.Any())
|
||||
{
|
||||
queuedMarkedContents.Peek().Add(image);
|
||||
}
|
||||
|
||||
images.Add(Union<XObjectContentRecord, InlineImage>.Two(image));
|
||||
|
||||
inlineImageBuilder = null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Starts a new marked-content element and makes it the current one.
/// <para>Top-level elements receive a new id; nested elements reuse the id of the enclosing
/// top-level element and are attached to their parent as a child.</para>
/// </summary>
/// <param name="name">The tag of the marked-content sequence.</param>
/// <param name="propertyDictionaryName">
/// Name of a property list to look up, or <c>null</c>. Resolving the named list is not implemented;
/// an error is logged when it is supplied.
/// </param>
/// <param name="properties">The property dictionary, or <c>null</c>.</param>
public void BeginMarkedContent(NameToken name, NameToken propertyDictionaryName, DictionaryToken properties)
{
    var isTopLevel = queuedMarkedContents.Count == 0;

    if (isTopLevel)
    {
        currentMarkedContentId++; // top parent id only
    }

    if (propertyDictionaryName != null)
    {
        // The named property list is not resolved yet, so only the problem is reported here.
        // The element keeps its real tag: the property-list name identifies a resource, not the tag,
        // and substituting it would stop an Artifact with a named property list being created as one.
        log.Error("BeginMarkedContent(): propertyDictionaryName not null to implement, name="
                  + name.Data + ", propertyDictionaryName=" + propertyDictionaryName);
    }

    var markedContent = PdfMarkedContent.Create(currentMarkedContentId, name, properties);

    if (!isTopLevel)
    {
        // Only non-null elements are ever pushed, so the parent needs no null check.
        queuedMarkedContents.Peek().Add(markedContent);
    }

    queuedMarkedContents.Push(markedContent);
}
|
||||
|
||||
/// <summary>
/// Closes the current marked-content element. When the closed element was a top-level one
/// (nothing remains open afterwards) it is added to the collected marked contents.
/// Does nothing when no element is open.
/// </summary>
public void EndMarkedContent()
{
    if (queuedMarkedContents.Count == 0)
    {
        return;
    }

    var finished = queuedMarkedContents.Pop();

    // Nested elements are already reachable through their parent; only roots are stored.
    if (queuedMarkedContents.Count == 0)
    {
        markedContents.Add(finished);
    }
}
|
||||
|
||||
private void AdjustTextMatrix(double tx, double ty)
|
||||
{
|
||||
var matrix = TransformationMatrix.GetTranslationMatrix(tx, ty);
|
||||
|
@@ -1,9 +1,8 @@
|
||||
namespace UglyToad.PdfPig.Graphics
|
||||
{
|
||||
using System.Collections.Generic;
|
||||
using Geometry;
|
||||
using Tokens;
|
||||
using PdfPig.Core;
|
||||
using System.Collections.Generic;
|
||||
using Tokens;
|
||||
using Util.JetBrains.Annotations;
|
||||
|
||||
/// <summary>
|
||||
@@ -98,6 +97,16 @@
|
||||
/// </summary>
|
||||
void ClosePath();
|
||||
|
||||
/// <summary>
|
||||
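/// Signal the start of a marked-content sequence with the given tag and, optionally, either the name of a
/// property dictionary or the property dictionary itself.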
|
||||
/// </summary>
|
||||
void BeginMarkedContent(NameToken name, NameToken propertyDictionaryName, DictionaryToken Properties);
|
||||
|
||||
/// <summary>
|
||||
/// Signal the end of the most recently started marked-content sequence.
|
||||
/// </summary>
|
||||
void EndMarkedContent();
|
||||
|
||||
/// <summary>
|
||||
/// Update the graphics state to apply the state from the named ExtGState dictionary.
|
||||
/// </summary>
|
||||
|
@@ -35,6 +35,7 @@
|
||||
/// <inheritdoc />
|
||||
public void Run(IOperationContext operationContext)
    => operationContext.BeginMarkedContent(Name, null, null); // no property list name or dictionary is passed
|
||||
|
||||
/// <inheritdoc />
|
||||
|
@@ -63,6 +63,7 @@
|
||||
/// <inheritdoc />
|
||||
public void Run(IOperationContext operationContext)
    => operationContext.BeginMarkedContent(Name, PropertyDictionaryName, Properties);
|
||||
|
||||
/// <inheritdoc />
|
||||
|
@@ -28,6 +28,7 @@
|
||||
/// <inheritdoc />
|
||||
public void Run(IOperationContext operationContext)
    => operationContext.EndMarkedContent();
|
||||
|
||||
/// <inheritdoc />
|
||||
|
Reference in New Issue
Block a user