/**
 * An AccentComposer translates from Unicode's combining accent forms
 * into composite characters with embedded accents.  This is necessary
 * on Mac OS X and 9, which return the combining accent forms as
 * filenames from many places, including File.list() and FileDialog.
 *
 * <p> The AccentComposer class is both a general-purpose compositor
 * and a Singleton set of default composition mappings.  The Singleton
 * is initialized with a set of mappings that covers the typical
 * accents one will see in a Roman or Latin-1 situation on Mac OS 9 or
 * X, plus the Japanese voiced and semi-voiced sound marks.  It does
 * not cover all possible cases for all of Unicode.  However, since you
 * can add additional mappings, or replace the defaults, you can easily
 * control how many or which accents will be composited.
 *
 * <p> There is no AccentDecomposer, and no such capability in this
 * class.
 *
 * <p> When this class was written the JDK offered no way to compose
 * accents.  Since Java 6, <code>java.text.Normalizer</code> (form NFC)
 * performs full canonical composition; this class remains useful when
 * only a small, controllable set of accents should be composed.
 *
 * <p> Instances are not synchronized.  The shared instance returned by
 * {@link #getCompositions()} should only be modified before it is used
 * from more than one thread.
 *
 * @author Gregory Guerin
 */
public class AccentComposer
{

    /** Shared instance holding the default mappings; used by composeAccents(). */
    private static final AccentComposer singleton = new AccentComposer();

    static
    {
        // These initializers cover mainly the ISO Latin-1 accented chars.
        // Capital-Y dieresis (U+0178) is included because it is in MacRoman,
        // so likely to appear on Mac OS X.  A few others are included simply
        // because they are accented vowels.
        singleton.add( '\u0300',    // grave `
            "AaEeIiOoUu",
            "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" );

        singleton.add( '\u0301',    // acute '
            "AaEeIiOoUuYy",
            "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" );

        singleton.add( '\u0302',    // circumflex ^
            "AaEeIiOoUuYy",
            "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" );

        singleton.add( '\u0303',    // tilde ~
            "AaNnOoUu",
            "\u00C3\u00E3\u00D1\u00F1\u00D5\u00F5\u0168\u0169" );

        singleton.add( '\u0308',    // umlaut/dieresis (two dots above)
            "AaEeIiOoUuYy",
            "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" );

        singleton.add( '\u030A',    // ring above (as in Angstrom)
            "Aa",
            "\u00C5\u00E5" );

        singleton.add( '\u0327',    // cedilla ,
            "Cc",
            "\u00C7\u00E7" );

        singleton.add( '\u3099',    // voiced sound mark
            "\u304b\u304d\u304f\u3051\u3053\u3055\u3057\u3059\u305b\u305d\u305f\u3061\u3064\u3066\u3068\u306f\u3072\u3075\u3078\u307b\u3046\u30ab\u30ad\u30af\u30b1\u30b3\u30b5\u30b7\u30b9\u30bb\u30bd\u30bf\u30c1\u30c4\u30c6\u30c8\u30cf\u30d2\u30d5\u30d8\u30db\u30a6",
            "\u304c\u304e\u3050\u3052\u3054\u3056\u3058\u305a\u305c\u305e\u3060\u3062\u3065\u3067\u3069\u3070\u3073\u3076\u3079\u307c\u3094\u30ac\u30ae\u30b0\u30b2\u30b4\u30b6\u30b8\u30ba\u30bc\u30be\u30c0\u30c2\u30c5\u30c7\u30c9\u30d0\u30d3\u30d6\u30d9\u30dc\u30f4" );

        singleton.add( '\u309a',    // semi-voiced sound mark
            "\u306f\u3072\u3075\u3078\u307b\u30cf\u30d2\u30d5\u30d8\u30db",
            "\u3071\u3074\u3077\u307a\u307d\u30d1\u30d4\u30d7\u30da\u30dd" );
    }

    /**
     * Return the Singleton AccentComposer that was statically initialized,
     * and which composeAccents() uses for composing accented characters.
     *
     * @return the shared, mutable default composer
     */
    public static AccentComposer getCompositions()
    {
        return singleton;
    }

    /**
     * Return a String with all known combining accents eliminated, and all
     * accented characters converted into their composite-accent forms.
     * Only accents known to the Singleton AccentComposer are eliminated or
     * composited.  Unknown accents are unaffected.
     *
     * <p> If the given String is null, empty, or contains no known
     * accents, the original String is returned.
     *
     * @param str the text to compose; may be null
     * @return the composed text, or <code>str</code> itself if nothing changed
     */
    public static String composeAccents( String str )
    {
        return singleton.compose( str );
    }

    /**
     * The accumulated set of all accent chars passed to add(), each one
     * appearing exactly once.  This is an optimization: compose() scans
     * this String for every char it examines, which is cheaper than a
     * table lookup for the (common) chars that are not accents.  The scan
     * is linear, which is fine because this class is not intended to hold
     * large numbers of different combining accents.
     */
    private String myAccents;

    /**
     * Table of composition mappings.  The key identifies a combining
     * accent (see keyFor()).  The value is a String[2]: array[0] lists the
     * uncombined characters and array[1] lists the corresponding
     * composited characters, position by position.
     *
     * <p> If the accent follows a character NOT listed in array[0], the
     * uncomposited char is left unaltered and the accent is discarded.
     * The remedy for an unwanted discard is to add a mapping, not to
     * retain the accent.
     */
    private Hashtable myComps;

    /**
     * Create an AccentComposer with no mappings.
     */
    public AccentComposer()
    {
        myAccents = "";
        myComps = new Hashtable();
    }

    /**
     * Clear all mappings.
     */
    public void clear()
    {
        myAccents = "";
        myComps.clear();
    }

    /**
     * Add a mapping for the combining accent.  Any existing mapping for
     * the given accent is replaced, so there can be only one mapping per
     * accent.  Individual mappings cannot be removed; use clear().
     *
     * <p> Any character can be given as the accent.  The pre and post
     * Strings must be non-null and of equal lengths, or the call is
     * silently ignored.  It would be unwise for either pre or post to
     * contain any combining accent characters.
     *
     * @param accent the combining accent character
     * @param pre the unaccented characters the accent may follow
     * @param post the composite characters, aligned with <code>pre</code>
     */
    public void add( char accent, String pre, String post )
    {
        if ( pre == null || post == null )
            return;

        if ( pre.length() != post.length() )
            return;

        // Record each accent only once; replacing a mapping must not make
        // the scan string grow.
        if ( myAccents.indexOf( accent ) < 0 )
            myAccents = myAccents + accent;

        myComps.put( keyFor( accent ), new String[] { pre, post } );
    }

    /**
     * Compose accents in the String.  Safely accommodates null and empty
     * Strings, which are returned as-is.
     *
     * @param str the text to compose; may be null
     * @return the composed text, or <code>str</code> itself if no accent
     *         was composed or discarded
     */
    public String compose( String str )
    {
        if ( str == null || str.length() == 0 )
            return str;

        char[] chars = str.toCharArray();
        int before = chars.length;
        int after = compose( chars, 0, before );

        // Every composition or discard shortens the text by one char, so
        // an unchanged length means an unchanged text.
        if ( before == after )
            return str;

        return new String( chars, 0, after );
    }

    /**
     * Compose accents in-place in the char[], over the range starting at
     * 'offset' and proceeding for 'count' chars.  The composed chars are
     * packed at the start of the range and their count is returned.
     *
     * <p> Each known accent in the range is removed.  If the char before
     * it (within the range) is listed in the accent's mapping, that char
     * is replaced by its composite form; otherwise it is left alone.  An
     * accent that is the first char of the range has no base char and is
     * simply removed; chars before 'offset' are never read or written.
     *
     * <p> Since handling an accent always removes one char, callers can
     * safely assume that if 'count' and the returned result are
     * identical, nothing was changed.
     *
     * @param chars the buffer to modify
     * @param offset index of the first char of the range
     * @param count number of chars in the range
     * @return the number of chars remaining in the range, or 0 if chars
     *         is null or empty, offset is negative, or count is not
     *         positive
     * @throws ArrayIndexOutOfBoundsException if the range extends past the
     *         end of chars
     */
    public int compose( char[] chars, int offset, int count )
    {
        if ( chars == null || chars.length == 0 || offset < 0 || count <= 0 )
            return 0;

        String knownAccents = accents();

        int put = offset;
        int end = offset + count;
        for ( int get = offset; get < end; ++get ) {
            char each = chars[ get ];

            // Not a known accent: a literal char, so move it into place.
            if ( knownAccents.indexOf( each ) < 0 ) {
                chars[ put++ ] = each;
                continue;
            }

            // From here on 'each' is an accent and is always dropped, so
            // 'put' never advances.

            // No base char inside the range yet.  Looking at put - 1 would
            // touch a char outside the range, or index -1 when offset is 0.
            if ( put == offset )
                continue;

            // accents() and mapping() can be overridden independently, so
            // do not assume they agree.
            String[] mapping = mapping( each );
            if ( mapping == null )
                continue;

            // Translate the prior char only if it is in the 'pre' String.
            int index = mapping[ 0 ].indexOf( chars[ put - 1 ] );
            if ( index >= 0 )
                chars[ put - 1 ] = mapping[ 1 ].charAt( index );
        }

        return put - offset;
    }

    /**
     * Return a String containing all the accent characters for which a
     * mapping was add()'ed, each one once, in order of first addition.
     *
     * @return the known accents; empty if there are none
     */
    public String accents()
    {
        return myAccents;
    }

    /**
     * Find a mapping for the given accent character.  The returned
     * String[] always holds exactly two Strings of identical length.  The
     * String at [0] is a sequence of accentable (unaccented) characters.
     * The String at [1] holds the composite accented character for each
     * of them at the same position.
     *
     * <p> The returned array is the one stored in the table, not a copy.
     *
     * @param accent the combining accent character
     * @return the mapping, or null if no mapping is known
     */
    public String[] mapping( char accent )
    {
        return (String[]) myComps.get( keyFor( accent ) );
    }

    /**
     * Build the table key for an accent.  A one-character String is used
     * rather than a Character because the Character constructor is
     * deprecated in current JDKs, while String.valueOf(char) is available
     * in every Java version.
     */
    private static Object keyFor( char accent )
    {
        return String.valueOf( accent );
    }

}