diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/ShowTextOpWithUnbalancedRoundBrackets.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/ShowTextOpWithUnbalancedRoundBrackets.pdf
new file mode 100644
index 00000000..a2475e38
Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/ShowTextOpWithUnbalancedRoundBrackets.pdf differ
diff --git a/src/UglyToad.PdfPig.Tests/Integration/ShowTextEscapeText.cs b/src/UglyToad.PdfPig.Tests/Integration/ShowTextEscapeText.cs
new file mode 100644
index 00000000..0ea5054c
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/ShowTextEscapeText.cs
@@ -0,0 +1,48 @@
+namespace UglyToad.PdfPig.Tests.Integration
+{
+    using System.Linq;
+    using UglyToad.PdfPig;
+    using UglyToad.PdfPig.Writer;
+    using Xunit;
+
+    public class ShowTextEscapeText
+    {
+        private static string GetFilename()
+        {
+            // The single page of the source PDF has 3 ShowText operations with unbalanced round brackets in the text.
+            // Unbalanced meaning there is an open bracket without a close bracket, or a close bracket without an open one.
+            // 1. line 387 (\() Tj
+            // 2. line 397 (\)) Tj
+            // 3. line 384 ( \(I\222ll try to stay on ) Tj
+            // Note: in text 3 the 0222 (octal), i.e. 0x92 (hex), byte is similar to an apostrophe ' so the text is similar to " (I'll try to stay on" (with an open bracket).
+            return IntegrationHelpers.GetDocumentPath("ShowTextOpWithUnbalancedRoundBrackets.pdf");
+        }
+
+        [Fact]
+        public void PdfCopyShowTextOpUsesEscapedText()
+        {
+            var filePath = GetFilename();
+            using (var sourceDocument = PdfDocument.Open(filePath))
+            {
+                PdfDocumentBuilder pdfBuilder = new PdfDocumentBuilder();
+                var numberOfPages = sourceDocument.NumberOfPages;
+                int pageNumber = 1; // The source PDF has a single page, so only page 1 is copied.
+                {
+                    var sourcePage = sourceDocument.GetPage(pageNumber);
+
+                    pdfBuilder.AddPage(sourcePage.Width, sourcePage.Height).CopyFrom(sourcePage);
+                }
+                var pdfBytes = pdfBuilder.Build();
+
+                // Reread (in memory) the copied PDF and check that the example text ("wander") exists in the word extract after a ShowText operation with an unbalanced bracket.
+                using (var document = PdfDocument.Open(pdfBytes))
+                {
+                    var page = document.GetPage(1);
+                    var words = page.GetWords();
+                    var isExpectedTextInCopiedPdf = words.Any(w => w.Text.Contains("wander"));
+                    Assert.True(isExpectedTextInCopiedPdf);
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowText.cs b/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowText.cs
index 3dae066f..11e589b1 100644
--- a/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowText.cs
+++ b/src/UglyToad.PdfPig/Graphics/Operations/TextShowing/ShowText.cs
@@ -67,16 +67,48 @@
             operationContext.ShowText(input);
         }
 
+        // Escapes the characters that are special inside a PDF literal string so that the
+        // text can be written between parentheses. Returns null when the input is null.
+        //
+        // Fix for issue 350. PDF specification 1.7 (page 408), on the strings passed to the
+        // text-showing operators:
+        //   The strings must conform to the syntax for string objects.
+        //   When a string is written by enclosing the data in parentheses,
+        //   bytes whose values are the same as those of the ASCII characters
+        //   left parenthesis (40), right parenthesis (41), and backslash (92)
+        //   must be preceded by a backslash character.
+        //   All other byte values between 0 and 255 may be used in a string object.
+        //   These rules apply to each individual byte in a string object, whether the string
+        //   is interpreted by the text-showing operators as single-byte or multiple-byte
+        //   character codes.
+        private static string EscapeText(string text)
+        {
+            if (text is null)
+            {
+                return null;
+            }
+
+            // Order matters: escape backslashes first, otherwise the backslashes introduced
+            // when escaping the brackets would themselves be escaped again.
+            text = text.Replace(@"\", @"\\");   // Escape any backslash      '\' -> '\\'
+            text = text.Replace("(", @"\(");    // Escape any open bracket   '(' -> '\('
+            text = text.Replace(")", @"\)");    // Escape any close bracket  ')' -> '\)'
+
+            return text;
+        }
+
         ///
         public void Write(Stream stream)
         {
             if (Bytes != null)
             {
                 stream.WriteHex(Bytes);
             }
             else
             {
-                stream.WriteText($"({Text})");
+                // Escape '(', ')' and '\' so the parenthesised literal string stays well-formed.
+                var escapedText = EscapeText(Text);
+                stream.WriteText($"({escapedText})");
             }
 
             stream.WriteWhiteSpace();