AccentComposer xref

View Javadoc

1   /*
2    *  XNap
3    *
4    *  A pure java file sharing client.
5    *
6    *  See AUTHORS for copyright information.
7    *
8    *  This program is free software; you can redistribute it and/or modify
9    *  it under the terms of the GNU General Public License as published by
10   *  the Free Software Foundation; either version 2 of the License, or
11   *  (at your option) any later version.
12   *
13   *  This program is distributed in the hope that it will be useful,
14   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   *  GNU General Public License for more details.
17   *
18   *  You should have received a copy of the GNU General Public License
19   *  along with this program; if not, write to the Free Software
20   *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21   *
22   */
23  
24  /*
25   * This file has been adapted for XNap.
26   */
27  
28  /*
29   * Copyright 1998, 1999, 2001 by Gregory L. Guerin.
30   * Terms of use:
31   *  - Briefly: OPEN SOURCE under Artistic License -- credit fairly, use freely,
32   *    alter carefully.
33   *  - Fully: <http://www.amug.org/~glguerin/sw/artistic-license.html>
34   *
35   * This file is from the MacBinary Toolkit for Java:
36   *   <http://www.amug.org/~glguerin/sw/#macbinary> 
37   */
38  
39  package org.xnap.util;
40  
41  import java.util.Hashtable;
42  
43  // --- Revision History ---
44  // 23May01 GLG  create, though I'm a bit annoyed that I have to
45  // 30May01 GLG  minor name changes
46  // 01Jun01 GLG  add capital-Y dieresis (\u0178)
47  
/**
 * An AccentComposer translates from Unicode's combining accent forms
 * into composite characters with embedded accents.  This is necessary
 * on Mac OS X and 9, which return the combining accent forms as
 * filenames from many places, including File.list() and FileDialog.
 *
 * <p> The AccentComposer class is both a general-purpose compositor
 * and a Singleton set of default composition mappings.  The Singleton
 * is initialized with a set of mappings that covers the typical
 * accents one will see in a Roman or Latin-1 situation on Mac OS 9 or
 * X, plus the Japanese voiced and semi-voiced sound marks.  It does
 * not cover all possible cases for all of Unicode.  However, since
 * you can add additional mappings, or replace the defaults, you can
 * easily control how many or which accents will be composited.
 *
 * <p> The AccentComposer.composeAccents() method is
 * called by the Mac-specific PathnameFormats defined in
 * "PathnameFormat.java".  In reality, only Mac OS 9 and X need this,
 * but it's not worth the effort to split out Mac OS into pre-9, 9,
 * and X sub-versions.  At least not yet.
 *
 * <p> Note that there is no
 * AccentDecomposer, and no such capability in this class.  This is
 * unnecessary for my uses, since the file-system itself happily takes
 * composited characters and turns them into combining form.  Good
 * thing, too, because decomposing accented characters is more
 * difficult than combining accents and characters together.
 *
 * <p> Personally, I think it's lame that Java doesn't have a class in
 * "java.text" that performs compositing.  There's Collator and
 * different ways of comparing accented Strings, but no way to
 * actually translate back and forth between the composited and
 * combining forms.  (Later Java releases added java.text.Normalizer,
 * which post-dates this code.)
 *
 * <p> Instances are not synchronized as a whole: add() and clear()
 * update two fields separately, so mutate an instance (including the
 * Singleton) only while no other thread is composing with it.
 *
 * @author Gregory Guerin
 */

public class AccentComposer
{

    /** The shared default instance, populated by the static initializer. */
    private static final AccentComposer singleton = new AccentComposer();

    static
    {
        // These initializers cover mainly the ISO Latin-1 accented chars.
        // I've included capital-Y dieresis (\u0178) because it's in MacRoman,
        // so likely to appear on Mac OS X.
        // I've included a few others simply because they're accented vowels.
        singleton.add( '\u0300',		// grave `
                       "AaEeIiOoUu",
                       "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" );

        singleton.add( '\u0301',		// acute '
                       "AaEeIiOoUuYy",
                       "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" );

        singleton.add( '\u0302',		// circumflex ^
                       "AaEeIiOoUuYy",
                       "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" );

        singleton.add( '\u0303',		// tilde ~
                       "AaNnOoUu",
                       "\u00C3\u00E3\u00D1\u00F1\u00D5\u00F5\u0168\u0169" );

        singleton.add( '\u0308',		// umlaut/dieresis (two dots above)
                       "AaEeIiOoUuYy",
                       "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" );

        singleton.add( '\u030A',		// ring above (as in Angstrom)
                       "Aa",
                       "\u00C5\u00E5" );

        singleton.add( '\u0327',		// cedilla ,
                       "Cc",
                       "\u00C7\u00E7" );

        singleton.add( '\u3099',		// voiced sound mark
                       "\u304b\u304d\u304f\u3051\u3053\u3055\u3057\u3059\u305b\u305d\u305f\u3061\u3064\u3066\u3068\u306f\u3072\u3075\u3078\u307b\u3046\u30ab\u30ad\u30af\u30b1\u30b3\u30b5\u30b7\u30b9\u30bb\u30bd\u30bf\u30c1\u30c4\u30c6\u30c8\u30cf\u30d2\u30d5\u30d8\u30db\u30a6",
                       "\u304c\u304e\u3050\u3052\u3054\u3056\u3058\u305a\u305c\u305e\u3060\u3062\u3065\u3067\u3069\u3070\u3073\u3076\u3079\u307c\u3094\u30ac\u30ae\u30b0\u30b2\u30b4\u30b6\u30b8\u30ba\u30bc\u30be\u30c0\u30c2\u30c5\u30c7\u30c9\u30d0\u30d3\u30d6\u30d9\u30dc\u30f4" );

        singleton.add( '\u309a',		// semi-voiced sound mark
                       "\u306f\u3072\u3075\u3078\u307b\u30cf\u30d2\u30d5\u30d8\u30db",
                       "\u3071\u3074\u3077\u307a\u307d\u30d1\u30d4\u30d7\u30da\u30dd" );
    }

    /**
     * Return the Singleton AccentComposer that was statically initialized, and
     * which composeAccents() uses for composing accented characters.
     * Mappings added to or cleared from the returned instance affect
     * every later call to composeAccents().
     *
     * @return the shared default AccentComposer, never null
     */

    public static AccentComposer getCompositions()
    {
        return singleton;
    }

    /**
     * Return a String with all combining accents eliminated, and all
     * accented characters converted into their composite-accent
     * forms.  Only accents known to the Singleton AccentComposer
     * will be eliminated or composited.  Unknown accents are
     * unaffected.
     *
     * <p> If the given String is null, empty, or
     * contains no known accents, the original String is returned.
     *
     * @param str the String to compose, may be null
     * @return the composed String, or str itself if nothing changed
     */

    public static String composeAccents( String str )
    {
        return singleton.compose( str );
    }

    /**
     * The String myAccents is an optimization.  On the theory that
     * it's faster and less memory-intensive to scan a String for a
     * char, myAccents holds the accumulated set of all accent chars
     * passed to add(), each listed once.  The idea is that compose()
     * can quickly scan this String with each char it examines, and
     * rapidly determine whether a composition is necessary or not.
     * This scan is presumed to be faster than instantiating a
     * Character to key into a Hashtable and find no element.  While
     * it's fairly obvious that a String scan is faster for small to
     * moderate String lengths, long Strings (i.e. lots of accents)
     * will cause the linear scan to dominate.  Since this class is
     * not intended to deal with large numbers of different combining
     * accents, this approach should not become a bottleneck.
     */

    private String myAccents;

    /**
     * Table of composition mappings.  Key is a Character
     * representing a combining accent.  Value is a String[2]
     * array representing a mapping between an uncombined
     * character in array[0] and its corresponding composited
     * character in array[1].  <p> If the accent follows a
     * character NOT listed in array[0], the accent has no effect,
     * i.e. the uncomposited char is unaltered and the accent is
     * discarded.  Discarding the accent may not be the best thing
     * to do, but since the idea is to eliminate combining
     * accents, it makes sense.  The solution is to add a mapping,
     * not to retain the accent.
     */

    private final Hashtable myComps;

    /**
     * Create an AccentComposer with no mappings.
     */
    public AccentComposer()
    {
        myAccents = "";
        myComps = new Hashtable();
    }

    /**
     * Clear all mappings.
     */

    public void clear()
    {
        myAccents = "";
        myComps.clear();
    }

    /**
     * Add a mapping for the combining accent.  Any existing mapping
     * for the given accent is replaced, so there can be only one
     * mapping per accent.
     *
     * <p> You can't yet remove a single mapping, only clear() them
     * all.  This is not usually a big problem.
     *
     * <p> Any character can be given
     * as the accent.  The pre and post Strings must be non-null and
     * of equal lengths, or the mapping is ignored.  It would be
     * unwise for either pre or post to contain any combining accent
     * characters.
     *
     * @param accent the combining accent character
     * @param pre the unaccented characters this accent applies to
     * @param post the composite characters, position-for-position with pre
     */

    public void add( char accent, String pre, String post )
    {
        if ( pre == null  ||  post == null )
            return;

        if ( pre.length() != post.length() )
            return;

        // Replacing an existing mapping must not list the accent twice.
        if ( myAccents.indexOf( accent ) < 0 )
            myAccents = myAccents + accent;

        myComps.put( new Character( accent ), new String[] { pre, post } );
    }

    /**
     * Compose accents in the String.
     * Safely accommodates null, empty, etc.
     *
     * @param str the String to compose, may be null
     * @return a new composed String, or str itself when it is null,
     *         empty, or contains no known accents
     */

    public String compose( String str )
    {
        if ( str == null  ||  str.length() == 0 )
            return str;

        char[] chars = str.toCharArray();
        int before = chars.length;
        int after = compose( chars, 0, before );

        // Every known accent shortens the result by one char, so equal
        // lengths mean nothing changed and the original can be reused.
        if ( before == after )
            return str;

        return new String( chars, 0, after );
    }

    /**
     * Compose accents in-place in the char[], over the range
     * starting at 'offset' and proceeding for 'count' chars.
     * The resulting count of composed chars is returned; the composed
     * chars occupy the start of the range, and any chars of the range
     * beyond the returned count are left with unspecified contents.
     *
     * <p> Since a composition always results in two chars being
     * combined into one, callers can safely assume that if 'count'
     * and the returned result are identical, no characters were
     * composed.
     *
     * <p> A known accent that has no preceding char inside the range
     * (i.e. it is the first char of the range, or follows only other
     * discarded accents) is simply discarded.  Chars outside the range
     * are never read or modified.
     *
     * @param chars the chars to compose in place
     * @param offset index of the first char of the range
     * @param count number of chars in the range
     * @return the number of chars remaining in the range after
     *         composition, or 0 if chars is null or empty, offset is
     *         negative, or count is not positive
     * @throws ArrayIndexOutOfBoundsException if the range extends past
     *         the end of chars; the array is not modified in that case
     */

    public int compose( char[] chars, int offset, int count )
    {
        if ( chars == null  ||  chars.length == 0  ||  offset < 0  ||  count <= 0 )
            return 0;

        // Fail before touching the array rather than part-way through it.
        // Written as a subtraction so that offset + count cannot overflow.
        if ( count > chars.length - offset )
            throw new ArrayIndexOutOfBoundsException(
                "Range offset=" + offset + ", count=" + count
                + " exceeds array length " + chars.length );

        String knownAccents = accents();
        int end = offset + count;

        // 'get' walks the input; 'put' trails it, marking the next output slot.
        int put = offset;
        for ( int get = offset;  get < end;  ++get ) {
            char each = chars[ get ];

            // Only consult the table for chars listed as accents; everything
            // else, including an accent without a mapping, is a literal char.
            String[] mapping =
                ( knownAccents.indexOf( each ) >= 0 ) ? mapping( each ) : null;
            if ( mapping == null ) {
                chars[ put++ ] = each;
                continue;
            }

            // Translate the prior char only if there is one inside the range
            // and it's in the 'pre' String.  Otherwise the prior char (if any)
            // is unchanged.  Either way 'put' is not advanced, which discards
            // the accent char itself.
            if ( put > offset ) {
                int index = mapping[ 0 ].indexOf( chars[ put - 1 ] );
                if ( index >= 0 )
                    chars[ put - 1 ] = mapping[ 1 ].charAt( index );
            }
        }

        return put - offset;
    }

    /**
     * Return a String containing all the accent characters for which
     * a mapping was add()'ed.
     *
     * @return the known accent characters, possibly empty, never null
     */

    public String accents()
    {
        return myAccents;
    }

    /**
     * Find a mapping for the given accent character, returning a
     * String[], or null if no mapping is known.  The String[] always
     * holds exactly two Strings of identical length.  The String at
     * [0] is a sequence of accentable characters, i.e. the unaccented
     * character forms.  The String at [1] is a sequence of composite
     * accented characters.  Each unaccented character in String[0]
     * has a corresponding accented (composite) character at the same
     * position in String[1].
     *
     * <p> The returned array is the one held internally, not a copy,
     * so callers should treat it as read-only.
     *
     * @param accent the combining accent character to look up
     * @return the two-element mapping array, or null if none is known
     */

    public String[] mapping( char accent )
    {
        return (String[]) myComps.get( new Character( accent ) );
    }

}