mirror of
				https://github.com/UglyToad/PdfPig.git
				synced 2025-10-31 16:46:55 +08:00 
			
		
		
		
	Merge pull request #440 from UglyToad/diacritics-stuff
add handling for combining diacritics #439
This commit is contained in:
		| @@ -42,6 +42,5 @@ | |||||||
|                 Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text); |                 Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -0,0 +1,18 @@ | |||||||
|  | namespace UglyToad.PdfPig.Tests.Integration; | ||||||
|  |  | ||||||
|  | using Xunit; | ||||||
|  |  | ||||||
/// <summary>
/// Integration tests run against the "Math119FakingData.pdf" sample document.
/// </summary>
public class Math119FakingDataTests
{
    /// <summary>
    /// Opens the sample document, loads page 8 and requests the words on that page.
    /// </summary>
    /// <remarks>
    /// NOTE(review): this test contains no assertions, so it can only fail if one of the calls
    /// below throws. The expected word content still needs to be asserted.
    /// </remarks>
    [Fact]
    public void CombinesDiaeresisForWords()
    {
        var path = IntegrationHelpers.GetDocumentPath("Math119FakingData.pdf");

        using (var document = PdfDocument.Open(path))
        {
            var pageWords = document.GetPage(8).GetWords();
        }
    }
}
| @@ -17,6 +17,7 @@ | |||||||
|     using Tokenization.Scanner; |     using Tokenization.Scanner; | ||||||
|     using Tokens; |     using Tokens; | ||||||
|     using Operations.TextPositioning; |     using Operations.TextPositioning; | ||||||
|  |     using Util; | ||||||
|     using XObjects; |     using XObjects; | ||||||
|     using static PdfPig.Core.PdfSubpath; |     using static PdfPig.Core.PdfSubpath; | ||||||
|  |  | ||||||
| @@ -293,15 +294,58 @@ | |||||||
|                     ? currentState.CurrentNonStrokingColor |                     ? currentState.CurrentNonStrokingColor | ||||||
|                     : currentState.CurrentStrokingColor; |                     : currentState.CurrentStrokingColor; | ||||||
|  |  | ||||||
|                 var letter = new Letter(unicode, transformedGlyphBounds, |                 Letter letter = null; | ||||||
|                     transformedPdfBounds.BottomLeft, |                 if (Diacritics.IsInCombiningDiacriticRange(unicode) && bytes.CurrentOffset > 0 && letters.Count > 0) | ||||||
|                     transformedPdfBounds.BottomRight, |                 { | ||||||
|                     transformedPdfBounds.Width, |                     var attachTo = letters[letters.Count - 1]; | ||||||
|                     fontSize, |  | ||||||
|                     font.Details, |                     if (attachTo.TextSequence == textSequence | ||||||
|                     color, |                         && Diacritics.TryCombineDiacriticWithPreviousLetter(unicode, attachTo.Value, out var newLetter)) | ||||||
|                     pointSize, |                     { | ||||||
|                     textSequence); |                         // TODO: union of bounding boxes. | ||||||
|  |                         letters.Remove(attachTo); | ||||||
|  |  | ||||||
|  |                         letter = new Letter( | ||||||
|  |                             newLetter, | ||||||
|  |                             attachTo.GlyphRectangle, | ||||||
|  |                             attachTo.StartBaseLine, | ||||||
|  |                             attachTo.EndBaseLine, | ||||||
|  |                             attachTo.Width, | ||||||
|  |                             attachTo.FontSize, | ||||||
|  |                             attachTo.Font, | ||||||
|  |                             attachTo.Color, | ||||||
|  |                             attachTo.PointSize, | ||||||
|  |                             attachTo.TextSequence); | ||||||
|  |                     } | ||||||
|  |                     else | ||||||
|  |                     { | ||||||
|  |                         letter = new Letter( | ||||||
|  |                             unicode, | ||||||
|  |                             transformedGlyphBounds, | ||||||
|  |                             transformedPdfBounds.BottomLeft, | ||||||
|  |                             transformedPdfBounds.BottomRight, | ||||||
|  |                             transformedPdfBounds.Width, | ||||||
|  |                             fontSize, | ||||||
|  |                             font.Details, | ||||||
|  |                             color, | ||||||
|  |                             pointSize, | ||||||
|  |                             textSequence); | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 else | ||||||
|  |                 { | ||||||
|  |                     letter = new Letter( | ||||||
|  |                         unicode, | ||||||
|  |                         transformedGlyphBounds, | ||||||
|  |                         transformedPdfBounds.BottomLeft, | ||||||
|  |                         transformedPdfBounds.BottomRight, | ||||||
|  |                         transformedPdfBounds.Width, | ||||||
|  |                         fontSize, | ||||||
|  |                         font.Details, | ||||||
|  |                         color, | ||||||
|  |                         pointSize, | ||||||
|  |                         textSequence); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|                 letters.Add(letter); |                 letters.Add(letter); | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										75
									
								
								src/UglyToad.PdfPig/Util/Diacritics.cs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								src/UglyToad.PdfPig/Util/Diacritics.cs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,75 @@ | |||||||
namespace UglyToad.PdfPig.Util
{
    using System;
    using System.Collections.Generic;
    using System.Globalization;

    /// <summary>
    /// Helpers for recognising diacritic marks and for merging a combining mark into the
    /// letter which precedes it.
    /// </summary>
    internal static class Diacritics
    {
        /// <summary>
        /// First character of the Unicode "Combining Diacritical Marks" block (U+0300, decimal 768).
        /// </summary>
        private const char FirstCombiningDiacritic = '\u0300';

        /// <summary>
        /// Last character of the Unicode "Combining Diacritical Marks" block (U+036F, decimal 879).
        /// </summary>
        private const char LastCombiningDiacritic = '\u036F';

        /// <summary>
        /// Symbols reported by <see cref="IsPotentialStandaloneDiacritic"/>. These are spacing
        /// characters, they are not part of the combining range checked by
        /// <see cref="IsInCombiningDiacriticRange"/>.
        /// </summary>
        private static readonly HashSet<string> NonCombiningDiacritics = new HashSet<string>
        {
            "´",
            "^",
            "ˆ",
            "¨",
            "©",
            "™",
            "®",
            "`",
            "˜",
            "∼",
            "¸"
        };

        /// <summary>
        /// Whether <paramref name="value"/> is one of the known standalone (non-combining) symbols.
        /// </summary>
        /// <param name="value">The text to look up, compared as a whole string.</param>
        /// <returns><see langword="true"/> if the value is in the known set, otherwise <see langword="false"/>.</returns>
        public static bool IsPotentialStandaloneDiacritic(string value) => NonCombiningDiacritics.Contains(value);

        /// <summary>
        /// Whether <paramref name="value"/> is exactly one character lying in the range
        /// U+0300 to U+036F inclusive.
        /// </summary>
        /// <param name="value">The text to test. May be <see langword="null"/>.</param>
        /// <returns>
        /// <see langword="true"/> for a single character inside the range; <see langword="false"/> for
        /// <see langword="null"/>, the empty string, longer strings and characters outside the range.
        /// </returns>
        public static bool IsInCombiningDiacriticRange(string value)
        {
            // A null value is simply "not a combining diacritic" rather than an error.
            if (value == null || value.Length != 1)
            {
                return false;
            }

            var character = value[0];

            return character >= FirstCombiningDiacritic && character <= LastCombiningDiacritic;
        }

        /// <summary>
        /// Attempts to append <paramref name="diacritic"/> to <paramref name="previous"/> so that the
        /// two form a single piece of text with the same number of text elements as
        /// <paramref name="previous"/> had on its own.
        /// </summary>
        /// <param name="diacritic">The mark to attach.</param>
        /// <param name="previous">The text of the preceding letter.</param>
        /// <param name="result">
        /// On success, <paramref name="previous"/> followed by <paramref name="diacritic"/>.
        /// On failure, <see langword="null"/>.
        /// </param>
        /// <returns>
        /// <see langword="true"/> if appending the mark did not increase the text element count;
        /// <see langword="false"/> if either input is missing or the mark formed a new text element.
        /// </returns>
        public static bool TryCombineDiacriticWithPreviousLetter(string diacritic, string previous, out string result)
        {
            result = null;

            // With nothing to attach, or nothing to attach to, there is no combination to report.
            if (previous == null || string.IsNullOrEmpty(diacritic))
            {
                return false;
            }

            var combined = previous + diacritic;

            // On combining the length should remain equal. If it grew, the mark did not attach
            // to the previous letter and the caller should treat it as a separate letter.
            if (MeasureDiacriticAwareLength(previous) != MeasureDiacriticAwareLength(combined))
            {
                return false;
            }

            result = combined;

            return true;
        }

        /// <summary>
        /// Counts the text elements in <paramref name="input"/>, as reported by <see cref="StringInfo"/>,
        /// rather than the number of <see cref="char"/> values.
        /// </summary>
        /// <param name="input">The text to measure. Must not be <see langword="null"/>.</param>
        /// <returns>The number of text elements.</returns>
        private static int MeasureDiacriticAwareLength(string input)
        {
            return new StringInfo(input).LengthInTextElements;
        }
    }
}
		Reference in New Issue
	
	Block a user
	 Eliot Jones
					Eliot Jones