mirror of
				https://github.com/UglyToad/PdfPig.git
				synced 2025-10-31 16:46:55 +08:00 
			
		
		
		
	Merge pull request #440 from UglyToad/diacritics-stuff
add handling for combining diacritics #439
This commit is contained in:
		| @@ -42,6 +42,5 @@ | ||||
|                 Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|     } | ||||
| } | ||||
| @@ -0,0 +1,18 @@ | ||||
| namespace UglyToad.PdfPig.Tests.Integration; | ||||
|  | ||||
| using Xunit; | ||||
|  | ||||
/// <summary>
/// Integration tests for word extraction from the "Math119FakingData.pdf" sample document.
/// </summary>
public class Math119FakingDataTests
{
    /// <summary>
    /// Opens the sample document, extracts the words of page 8 and checks that extraction
    /// completes and yields usable words.
    /// </summary>
    [Fact]
    public void CombinesDiaeresisForWords()
    {
        using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("Math119FakingData.pdf"));

        var lastPage = document.GetPage(8);

        var words = lastPage.GetWords();

        // Walk the whole sequence so that word extraction really runs for every word
        // (the returned sequence may be lazily evaluated — enumerating is harmless either way).
        var wordCount = 0;
        foreach (var word in words)
        {
            Assert.NotNull(word.Text);
            wordCount++;
        }

        Assert.True(wordCount > 0, "Expected at least one word on page 8.");

        // TODO(review): assert the specific word(s) on this page that should now carry a combined
        // diaeresis; the expected text is not known from this file alone.
    }
}
| @@ -17,6 +17,7 @@ | ||||
|     using Tokenization.Scanner; | ||||
|     using Tokens; | ||||
|     using Operations.TextPositioning; | ||||
|     using Util; | ||||
|     using XObjects; | ||||
|     using static PdfPig.Core.PdfSubpath; | ||||
|  | ||||
| @@ -293,7 +294,34 @@ | ||||
|                     ? currentState.CurrentNonStrokingColor | ||||
|                     : currentState.CurrentStrokingColor; | ||||
|  | ||||
|                 var letter = new Letter(unicode, transformedGlyphBounds, | ||||
|                 Letter letter = null; | ||||
|                 if (Diacritics.IsInCombiningDiacriticRange(unicode) && bytes.CurrentOffset > 0 && letters.Count > 0) | ||||
|                 { | ||||
|                     var attachTo = letters[letters.Count - 1]; | ||||
|  | ||||
|                     if (attachTo.TextSequence == textSequence | ||||
|                         && Diacritics.TryCombineDiacriticWithPreviousLetter(unicode, attachTo.Value, out var newLetter)) | ||||
|                     { | ||||
|                         // TODO: union of bounding boxes. | ||||
|                         letters.Remove(attachTo); | ||||
|  | ||||
|                         letter = new Letter( | ||||
|                             newLetter, | ||||
|                             attachTo.GlyphRectangle, | ||||
|                             attachTo.StartBaseLine, | ||||
|                             attachTo.EndBaseLine, | ||||
|                             attachTo.Width, | ||||
|                             attachTo.FontSize, | ||||
|                             attachTo.Font, | ||||
|                             attachTo.Color, | ||||
|                             attachTo.PointSize, | ||||
|                             attachTo.TextSequence); | ||||
|                     } | ||||
|                     else | ||||
|                     { | ||||
|                         letter = new Letter( | ||||
|                             unicode, | ||||
|                             transformedGlyphBounds, | ||||
|                             transformedPdfBounds.BottomLeft, | ||||
|                             transformedPdfBounds.BottomRight, | ||||
|                             transformedPdfBounds.Width, | ||||
| @@ -302,6 +330,22 @@ | ||||
|                             color, | ||||
|                             pointSize, | ||||
|                             textSequence); | ||||
|                     } | ||||
|                 } | ||||
|                 else | ||||
|                 { | ||||
|                     letter = new Letter( | ||||
|                         unicode, | ||||
|                         transformedGlyphBounds, | ||||
|                         transformedPdfBounds.BottomLeft, | ||||
|                         transformedPdfBounds.BottomRight, | ||||
|                         transformedPdfBounds.Width, | ||||
|                         fontSize, | ||||
|                         font.Details, | ||||
|                         color, | ||||
|                         pointSize, | ||||
|                         textSequence); | ||||
|                 } | ||||
|  | ||||
|                 letters.Add(letter); | ||||
|  | ||||
|   | ||||
							
								
								
									
										75
									
								
								src/UglyToad.PdfPig/Util/Diacritics.cs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								src/UglyToad.PdfPig/Util/Diacritics.cs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,75 @@ | ||||
namespace UglyToad.PdfPig.Util
{
    using System;
    using System.Collections.Generic;
    using System.Globalization;

    /// <summary>
    /// Helpers for recognising diacritic marks and for merging a combining mark into the text that precedes it.
    /// </summary>
    internal static class Diacritics
    {
        /// <summary>
        /// First code point accepted by <see cref="IsInCombiningDiacriticRange"/> (U+0300, decimal 768).
        /// </summary>
        private const char FirstCombiningDiacritic = '\u0300';

        /// <summary>
        /// Last code point accepted by <see cref="IsInCombiningDiacriticRange"/> (U+036F, decimal 879).
        /// </summary>
        private const char LastCombiningDiacritic = '\u036F';

        // Spacing (non-combining) characters reported by IsPotentialStandaloneDiacritic.
        // NOTE(review): the set also contains ©, ™ and ®, which are not diacritics — presumably deliberate; confirm with callers.
        private static readonly HashSet<string> NonCombiningDiacritics = new HashSet<string>
        {
            "´",
            "^",
            "ˆ",
            "¨",
            "©",
            "™",
            "®",
            "`",
            "˜",
            "∼",
            "¸"
        };

        /// <summary>
        /// Whether <paramref name="value"/> is exactly one of the known standalone (non-combining) marks.
        /// </summary>
        /// <param name="value">The text to check.</param>
        /// <returns><see langword="true"/> if the value is in the known set, otherwise <see langword="false"/>.</returns>
        public static bool IsPotentialStandaloneDiacritic(string value) => NonCombiningDiacritics.Contains(value);

        /// <summary>
        /// Whether <paramref name="value"/> is a single UTF-16 code unit in the range U+0300 to U+036F inclusive.
        /// </summary>
        /// <param name="value">The text to check; <see langword="null"/> is treated as "not in range".</param>
        /// <returns>
        /// <see langword="true"/> only for a one-character string inside the range; <see langword="false"/> for
        /// <see langword="null"/>, empty or longer strings and for characters outside the range.
        /// </returns>
        public static bool IsInCombiningDiacriticRange(string value)
        {
            // Null is answered with false rather than a NullReferenceException so callers can pass unmapped text safely.
            if (value == null || value.Length != 1)
            {
                return false;
            }

            var character = value[0];

            return character >= FirstCombiningDiacritic && character <= LastCombiningDiacritic;
        }

        /// <summary>
        /// Tries to append <paramref name="diacritic"/> to <paramref name="previous"/> so that the mark merges
        /// into the final text element of <paramref name="previous"/>.
        /// </summary>
        /// <param name="diacritic">The mark to append.</param>
        /// <param name="previous">The text the mark should attach to.</param>
        /// <param name="result">
        /// On success <paramref name="previous"/> followed by <paramref name="diacritic"/>; otherwise <see langword="null"/>.
        /// </param>
        /// <returns>
        /// <see langword="true"/> if appending the mark did not change the number of text elements, meaning the
        /// mark was absorbed by the preceding text; otherwise <see langword="false"/>.
        /// </returns>
        public static bool TryCombineDiacriticWithPreviousLetter(string diacritic, string previous, out string result)
        {
            result = null;

            // Without a mark there is nothing to combine, and without preceding text nothing to combine it with.
            if (previous == null || string.IsNullOrEmpty(diacritic))
            {
                return false;
            }

            var combined = previous + diacritic;

            // On combining the length (in text elements) should remain equal.
            if (MeasureDiacriticAwareLength(previous) != MeasureDiacriticAwareLength(combined))
            {
                return false;
            }

            // Only publish the combined text on success so a failed attempt never leaks a partial result.
            result = combined;

            return true;
        }

        /// <summary>
        /// Counts the text elements in <paramref name="input"/> as reported by <see cref="StringInfo"/>,
        /// so a base character followed by combining marks counts once.
        /// </summary>
        private static int MeasureDiacriticAwareLength(string input)
        {
            return new StringInfo(input).LengthInTextElements;
        }
    }
}
		Reference in New Issue
	
	Block a user
	 Eliot Jones
					Eliot Jones