@@ -45,64 +45,6 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
45
45
token .setValue (token .word ()+"-" +token .sentIndex ());
46
46
}
47
47
48
- /**
49
- * Some people write umlauts as two characters instead of just one
50
- *<br>
51
- * German CoreNLP doesn't handle the two character versions correctly,
52
- * so here we condense it into the one character version
53
- */
54
- public static void condenseUmlauts (CoreLabel token ) {
55
- String value = token .value ();
56
- String updatedValue = condenseUmlauts (value );
57
- if (updatedValue != null ) {
58
- token .setValue (updatedValue );
59
- }
60
-
61
- String word = token .word ();
62
- String updatedWord = condenseUmlauts (word );
63
- if (updatedWord != null ) {
64
- token .setWord (updatedWord );
65
- }
66
- }
67
-
68
- public static String condenseUmlauts (String value ) {
69
- StringBuilder ns = null ;
70
- for (int i = 0 ; i < value .length (); ++i ) {
71
- final char cur = value .charAt (i );
72
- if ((int ) cur == 776 ) {
73
- // this is the umlaut character
74
- if (ns == null ) {
75
- ns = new StringBuilder (value .length ());
76
- ns .append (value .substring (0 , i ));
77
- }
78
- final char prev = ns .length () == 0 ? ' ' : ns .charAt (ns .length () - 1 );
79
- if (prev == 'a' ) {
80
- ns .setCharAt (ns .length () - 1 , 'ä' );
81
- } else if (prev == 'A' ) {
82
- ns .setCharAt (ns .length () - 1 , 'Ä' );
83
- } else if (prev == 'o' ) {
84
- ns .setCharAt (ns .length () - 1 , 'ö' );
85
- } else if (prev == 'O' ) {
86
- ns .setCharAt (ns .length () - 1 , 'Ö' );
87
- } else if (prev == 'u' ) {
88
- ns .setCharAt (ns .length () - 1 , 'ü' );
89
- } else if (prev == 'U' ) {
90
- ns .setCharAt (ns .length () - 1 , 'Ü' );
91
- } else {
92
- ns .append (cur );
93
- }
94
- } else {
95
- if (ns != null ) {
96
- ns .append (cur );
97
- }
98
- }
99
- }
100
- if (ns != null ) {
101
- return ns .toString ();
102
- }
103
- return null ;
104
- }
105
-
106
48
@ Override
107
49
public List <CoreLabel > process (List <CoreLabel > tokens ) {
108
50
List <CoreLabel > processedTokens = new ArrayList <CoreLabel >();
@@ -134,9 +76,6 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
134
76
}
135
77
}
136
78
137
- for (CoreLabel label : processedTokens ) {
138
- condenseUmlauts (label );
139
- }
140
79
return processedTokens ;
141
80
}
142
81
0 commit comments