1 /*
2 * XNap
3 *
4 * A pure java file sharing client.
5 *
6 * See AUTHORS for copyright information.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 *
22 */
23
24 /*
25 * This file has been adopted for XNap.
26 */
27
28 /*
29 * Copyright 1998, 1999, 2001 by Gregory L. Guerin.
30 * Terms of use:
31 * - Briefly: OPEN SOURCE under Artistic License -- credit fairly, use freely,
32 * alter carefully.
33 * - Fully: <http://www.amug.org/~glguerin/sw/artistic-license.html>
34 *
35 * This file is from the MacBinary Toolkit for Java:
36 * <http://www.amug.org/~glguerin/sw/#macbinary>
37 */
38
39 package org.xnap.util;
40
41 import java.util.Hashtable;
42
43 // --- Revision History ---
44 // 23May01 GLG create, though I'm a bit annoyed that I have to
45 // 30May01 GLG minor name changes
46 // 01Jun01 GLG add capital-Y dieresis (\u0178)
47
/**
 * An AccentComposer translates from Unicode's combining accent forms
 * into composite characters with embedded accents. This is necessary
 * on Mac OS X and 9, which return the combining accent forms as
 * filenames from many places, including File.list() and FileDialog.
 *
 * <p> The AccentComposer class is both a general-purpose compositor
 * and a Singleton set of default composition mappings. The Singleton
 * is initialized with a set of mappings that covers the typical
 * accents one will see in a Roman or Latin-1 situation on Mac OS 9 or
 * X, plus the Japanese voiced and semi-voiced sound marks. It does
 * not cover all possible cases for all of Unicode. However, since you
 * can add additional mappings, or replace the defaults, you can
 * easily control how many or which accents will be composited.
 *
 * <p> The AccentComposer.composeAccents() method is called by the
 * Mac-specific PathnameFormats defined in "PathnameFormat.java". In
 * reality, only Mac OS 9 and X need this, but it's not worth the
 * effort to split out Mac OS into pre-9, 9, and X sub-versions.
 *
 * <p> Note that there is no AccentDecomposer, and no such capability
 * in this class. This is unnecessary for the author's uses, since the
 * file-system itself happily takes composited characters and turns
 * them into combining form. Decomposing accented characters is also
 * more difficult than combining accents and characters together.
 *
 * <p> This class was written because, at the time, Java had no class
 * in "java.text" that performs compositing: there is Collator and
 * different ways of comparing accented Strings, but no way to
 * translate between the composited and combining forms.
 *
 * @author Gregory Guerin
 */
public class AccentComposer
{

    /** The shared instance holding the default composition mappings. */
    private static final AccentComposer singleton = new AccentComposer();

    static
    {
        // These initializers cover mainly the ISO Latin-1 accented chars.
        // Capital-Y dieresis (U+0178) is included because it's in MacRoman,
        // so likely to appear on Mac OS X.
        // A few others are included simply because they're accented vowels.
        singleton.add( '\u0300', // grave `
                       "AaEeIiOoUu",
                       "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" );

        singleton.add( '\u0301', // acute '
                       "AaEeIiOoUuYy",
                       "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" );

        singleton.add( '\u0302', // circumflex ^
                       "AaEeIiOoUuYy",
                       "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" );

        singleton.add( '\u0303', // tilde ~
                       "AaNnOoUu",
                       "\u00C3\u00E3\u00D1\u00F1\u00D5\u00F5\u0168\u0169" );

        singleton.add( '\u0308', // umlaut/dieresis (two dots above)
                       "AaEeIiOoUuYy",
                       "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" );

        singleton.add( '\u030A', // ring above (as in Angstrom)
                       "Aa",
                       "\u00C5\u00E5" );

        singleton.add( '\u0327', // cedilla ,
                       "Cc",
                       "\u00C7\u00E7" );

        singleton.add( '\u3099', // voiced sound mark
                       "\u304b\u304d\u304f\u3051\u3053\u3055\u3057\u3059\u305b\u305d\u305f\u3061\u3064\u3066\u3068\u306f\u3072\u3075\u3078\u307b\u3046\u30ab\u30ad\u30af\u30b1\u30b3\u30b5\u30b7\u30b9\u30bb\u30bd\u30bf\u30c1\u30c4\u30c6\u30c8\u30cf\u30d2\u30d5\u30d8\u30db\u30a6",
                       "\u304c\u304e\u3050\u3052\u3054\u3056\u3058\u305a\u305c\u305e\u3060\u3062\u3065\u3067\u3069\u3070\u3073\u3076\u3079\u307c\u3094\u30ac\u30ae\u30b0\u30b2\u30b4\u30b6\u30b8\u30ba\u30bc\u30be\u30c0\u30c2\u30c5\u30c7\u30c9\u30d0\u30d3\u30d6\u30d9\u30dc\u30f4" );

        singleton.add( '\u309a', // semi-voiced sound mark
                       "\u306f\u3072\u3075\u3078\u307b\u30cf\u30d2\u30d5\u30d8\u30db",
                       "\u3071\u3074\u3077\u307a\u307d\u30d1\u30d4\u30d7\u30da\u30dd" );
    }

    /**
     * Return the Singleton AccentComposer that was statically
     * initialized, and which composeAccents() uses for composing
     * accented characters. Mappings added to or cleared from the
     * returned instance affect all later composeAccents() calls.
     *
     * @return the shared AccentComposer, never null
     */
    public static AccentComposer getCompositions()
    {
        return singleton;
    }

    /**
     * Return a String with all known combining accents eliminated, and
     * all accented characters converted into their composite-accent
     * forms. Only accents known to the Singleton AccentComposer will
     * be eliminated or composited. Unknown accents are unaffected.
     *
     * <p> If the given String is null, empty, or contains no known
     * accents, the original String is returned.
     *
     * @param str the text to compose; may be null
     * @return the composed text, or str itself when nothing changed
     */
    public static String composeAccents( String str )
    {
        return singleton.compose( str );
    }

    /**
     * The accumulated set of all accent chars passed to add(), each
     * appearing once.
     *
     * <p> This String is an optimization. compose() scans it with each
     * char it examines to decide whether a composition is needed,
     * which is presumed to be faster than instantiating a Character to
     * key into a Hashtable and find no element. A long String (i.e.
     * lots of accents) will cause the linear scan to dominate, but
     * this class is not intended to deal with large numbers of
     * different combining accents.
     */
    private String myAccents;

    /**
     * Table of composition mappings. Key is a Character representing
     * a combining accent. Value is a String[2] array representing a
     * mapping between the uncombined characters in array[0] and the
     * corresponding composited characters in array[1].
     *
     * <p> If the accent follows a character NOT listed in array[0],
     * the accent has no effect, i.e. the uncomposited char is
     * unaltered and the accent is discarded. Since the idea is to
     * eliminate combining accents, the way to keep the information is
     * to add a mapping, not to retain the accent.
     */
    private Hashtable myComps;

    /**
     * Create an AccentComposer with no mappings.
     */
    public AccentComposer()
    {
        myAccents = "";
        myComps = new Hashtable();
    }

    /**
     * Clear all mappings.
     */
    public void clear()
    {
        myAccents = "";
        myComps.clear();
    }

    /**
     * Add a mapping for the combining accent. Any existing mapping
     * for the given accent is replaced, so there can be only one
     * mapping per accent.
     *
     * <p> You can't yet remove a single mapping; use clear() to
     * remove them all.
     *
     * <p> Any character can be given as the accent. The pre and post
     * Strings must be non-null and of equal lengths, or the call is
     * silently ignored. It would be unwise for either pre or post to
     * contain any combining accent characters.
     *
     * @param accent the combining accent character
     * @param pre the unaccented characters the accent may follow
     * @param post the composite characters, position-for-position
     *        matching pre
     */
    public void add( char accent, String pre, String post )
    {
        if ( pre == null || post == null || pre.length() != post.length() ) {
            return;
        }

        // Record the accent only once, so that replacing a mapping does
        // not leave duplicates in the String returned by accents().
        if ( myAccents.indexOf( accent ) < 0 ) {
            myAccents = myAccents + accent;
        }
        myComps.put( key( accent ), new String[] { pre, post } );
    }

    /**
     * Compose accents in the String.
     * Safely accommodates null and empty Strings.
     *
     * @param str the text to compose; may be null
     * @return a new String when any accent was composed or discarded,
     *         otherwise str itself
     */
    public String compose( String str )
    {
        if ( str == null || str.length() == 0 ) {
            return str;
        }

        char[] chars = str.toCharArray();
        int before = chars.length;
        int after = compose( chars, 0, before );
        return ( before == after ) ? str : new String( chars, 0, after );
    }

    /**
     * Compose accents in-place in the char[], over the range starting
     * at 'offset' and proceeding for 'count' chars. The resulting
     * count of composed chars is returned; they occupy the array
     * starting at 'offset'. Array elements before 'offset' are never
     * modified.
     *
     * <p> Each known accent is removed from the output. If the char
     * placed just before it is listed in the accent's mapping, that
     * char is replaced by its composite form; otherwise it is left
     * unchanged. A known accent that appears before any other char of
     * the range has nothing to combine with and is simply discarded.
     *
     * <p> A null array, a negative offset, a non-positive count, or an
     * offset at or past the end of the array yields 0. A range that
     * runs past the end of the array is truncated to the array's end.
     *
     * <p> Since every handled accent shortens the output by one char,
     * callers can safely assume that if 'count' and the returned
     * result are identical, no characters were composed.
     *
     * @param chars the buffer to compose in place
     * @param offset index of the first char of the range
     * @param count number of chars in the range
     * @return the number of chars remaining in the range after
     *         composition
     */
    public int compose( char[] chars, int offset, int count )
    {
        if ( chars == null || offset < 0 || count <= 0 || offset >= chars.length ) {
            return 0;
        }

        // Clamp the range to the array. Comparing against the remaining
        // length (rather than computing offset + count first) cannot overflow.
        int available = chars.length - offset;
        int end = offset + ( count < available ? count : available );

        String knownAccents = accents();

        int put = offset;
        for ( int get = offset; get < end; ++get ) {
            char each = chars[ get ];

            // Optimization: only consult the table for chars listed in knownAccents.
            String[] mapping = ( knownAccents.indexOf( each ) < 0 ) ? null : mapping( each );

            // Not an accent (or no usable mapping for it): a literal char, so keep it.
            if ( mapping == null ) {
                chars[ put++ ] = each;
                continue;
            }

            // An accent with no char before it inside the range: there is
            // nothing to combine with, so drop it. Looking at chars[put - 1]
            // here would read before 'offset' (or before the array itself).
            if ( put == offset ) {
                continue;
            }

            // Translate the prior char only if it's in the 'pre' String.
            // If not, the accent is discarded and the prior char is unchanged.
            int index = mapping[ 0 ].indexOf( chars[ put - 1 ] );
            if ( index >= 0 ) {
                chars[ put - 1 ] = mapping[ 1 ].charAt( index );
            }

            // 'put' is deliberately not advanced: the accent char itself is
            // never copied to the output.
        }

        return put - offset;
    }

    /**
     * Return a String containing all the accent characters for which
     * a mapping was add()'ed.
     *
     * @return the known accent characters; empty if there are none
     */
    public String accents()
    {
        return myAccents;
    }

    /**
     * Find a mapping for the given accent character, returning a
     * String[], or null if no mapping is known. The String[] always
     * holds exactly two Strings of identical length. The String at
     * [0] is a sequence of accentable characters, i.e. the unaccented
     * character forms. The String at [1] is a sequence of composite
     * accented characters. Each unaccented character in String[0]
     * has a corresponding accented (composite) character at the same
     * position in String[1].
     *
     * @param accent the combining accent character to look up
     * @return the two-element mapping array, or null if none is known
     */
    public String[] mapping( char accent )
    {
        return (String[]) myComps.get( key( accent ) );
    }

    /**
     * Build the Hashtable key for an accent. Kept in one place so the
     * boxing strategy can be changed easily; on Java 5 and later
     * Character.valueOf(char) is the preferred replacement for the
     * constructor.
     */
    private static Object key( char accent )
    {
        return new Character( accent );
    }

}