001 /* 002 Copyright (c) 2012, Regents of the University of Colorado 003 All rights reserved. 004 005 Redistribution and use in source and binary forms, with or without modification, 006 are permitted provided that the following conditions are met: 007 008 * Redistributions of source code must retain the above copyright notice, this 009 list of conditions and the following disclaimer. 010 011 * Redistributions in binary form must reproduce the above copyright notice, 012 this list of conditions and the following disclaimer in the documentation 013 and/or other materials provided with the distribution. 014 015 * Neither the name of the University of Colorado nor the names of its 016 contributors may be used to endorse or promote products derived from this 017 software without specific prior written permission. 018 019 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 023 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030 031 package edu.ucdenver.ccp.nlp.biolemmatizer; 032 033 import java.io.BufferedReader; 034 import java.io.IOException; 035 import java.io.InputStream; 036 import java.io.InputStreamReader; 037 import java.util.Arrays; 038 import java.util.HashMap; 039 import java.util.Iterator; 040 import java.util.Map; 041 import java.util.regex.Matcher; 042 import java.util.regex.Pattern; 043 044 045 /** 046 * Takes a String and returns an Americanized version of it based on 047 * a list of British-to-American spellings and some rules. 048 * This is deterministic spelling conversion, and so cannot deal with 049 * certain cases involving complex ambiguities, but it can do most of the 050 * simple case of English to American conversion. 051 * <p> 052 * <i>The list and rules for handling 053 * British-to-American spellings is derived from:</i> 054 * <code>http://www.tysto.com/uk-us-spelling-list.html</code>. 055 * 056 * @author Original author: Christopher Manning; Modified and extended by Haibin Liu 057 */ 058 public class Americanize { 059 060 /** No word shorter in length than this is changed by Americanize */ 061 private static final int MINIMUM_LENGTH_CHANGED = 4; 062 /** No word shorter in length than this can match a Pattern */ 063 private static final int MINIMUM_LENGTH_PATTERN_MATCH = 6; 064 /** mapping from British spelling to American spelling */ 065 private Map<String,String> mappingBritishtoAmerican; 066 067 private static final String[] patStrings = { "(ph|an|h|gyn|arch|chim)ae", "haem(at)?o", "aemia$", "([lL])eukaem", 068 "programme(s?)$", "^([a-z]{3,})our(s?)$", 069 070 }; 071 072 private static final String[] reps = { 073 "$1e", "hem$1o", "emia", "$1eukem", "program$1", "$1or$2" 074 }; 075 076 private static final Pattern[] pats = new Pattern[patStrings.length]; 077 078 private static final Pattern disjunctivePattern; 079 080 static { 081 StringBuilder foo = new StringBuilder(); 082 for (int i = 0, len = pats.length; i < len; i++) { 083 pats[i] = Pattern.compile(patStrings[i]); 084 if (i > 0) { 085 foo.append('|'); 086 } 087 foo.append("(?:"); 088 // Remove groups from String before appending for speed 089 foo.append(patStrings[i].replaceAll("[()]", "")); 090 foo.append(')'); 091 } 092 disjunctivePattern = Pattern.compile(foo.toString()); 093 } 094 095 private static final String[] OUR_EXCEPTIONS = { 096 "abatjour", "beflour", "bonjour", 097 "calambour", "carrefour", "cornflour", "contour", 098 "de[tv]our", "dortour", "dyvour", "downpour", 099 "giaour", "glamour", "holour", "inpour", "outpour", 100 "pandour", "paramour", "pompadour", "recontour", "repour", "ryeflour", 101 "sompnour", 102 "tambour", "troubadour", "tregetour", "velour" 103 }; 104 105 private static final Pattern[] excepts = { 106 null, null, null, null, null, 107 Pattern.compile(join(OUR_EXCEPTIONS, "|")) 108 }; 109 110 111 /** 112 * Constructor to load the British-to-American spelling mapping file 113 */ 114 public Americanize() { 115 //load British to American spelling mapping file 116 String mappingFileName = "BritishToAmerican.mapping"; 117 InputStream is = Americanize.class.getResourceAsStream(mappingFileName); 118 try { 119 mappingBritishtoAmerican = loadSpellingMappingFile(is); 120 } catch (IOException e) { 121 throw new RuntimeException("Error while opening mapping file: " + mappingFileName, e); 122 } 123 } 124 125 /** 126 * Static method to load a British-to-American spelling mapping file 127 * 128 * @param is 129 * InputStream of the mapping file 130 * @return a Map object that stores the British-to-American spellings 131 * @throws IOException 132 */ 133 private static Map<String, String> loadSpellingMappingFile(InputStream is) throws IOException { 134 Map<String, String> mapping = new HashMap<String, String>(); 135 try { 136 InputStreamReader isr = new InputStreamReader(is); 137 BufferedReader input = new BufferedReader(isr); 138 139 String line = null; 140 while ((line = input.readLine()) != null) { 141 line = line.trim(); 142 String[] pair = line.split("\t"); 143 mapping.put(pair[0], pair[1]); 144 } 145 input.close(); 146 isr.close(); 147 } finally { 148 is.close(); 149 } 150 return mapping; 151 } 152 153 /** 154 * Convert the spelling of a word from British to American English. 155 * This is deterministic spelling conversion, and so cannot deal with 156 * certain cases involving complex ambiguities, but it can do most of the 157 * simple cases of English to American conversion. 158 * 159 * @param str The String to be Americanized 160 * @return The American spelling of the word. 161 */ 162 public String americanize(String str) { 163 // No ver short words are changed, so short circuit them 164 int length = str.length(); 165 if (length < MINIMUM_LENGTH_CHANGED) { 166 return str; 167 } 168 String result; 169 result = mappingBritishtoAmerican.get(str); 170 if (result != null) { 171 return result; 172 } 173 174 if (length < MINIMUM_LENGTH_PATTERN_MATCH) { 175 return str; 176 } 177 // first do one disjunctive regex and return unless matches. Faster! 178 // (But still allocates matcher each time; avoiding this would make this class not threadsafe....) 179 if ( ! disjunctivePattern.matcher(str).find()) { 180 return str; 181 } 182 for (int i = 0; i < pats.length; i++) { 183 Matcher m = pats[i].matcher(str); 184 if (m.find()) { 185 Pattern ex = excepts[i]; 186 if (ex != null) { 187 Matcher me = ex.matcher(str); 188 if (me.find()) { 189 continue; 190 } 191 } 192 // System.err.println("Replacing " + word + " with " + 193 // pats[i].matcher(word).replaceAll(reps[i])); 194 return m.replaceAll(reps[i]); 195 } 196 } 197 return str; 198 } 199 200 /** 201 * static method to concatenate String items with a specified delimiter 202 * @param s 203 * @param delimiter 204 * @return concatenated String items with a specified delimiter 205 */ 206 public static final String join(String[] s, String delimiter) { 207 if (s.length == 0) { 208 return ""; 209 } 210 Iterator<String> iter = Arrays.asList(s).iterator(); 211 StringBuffer buffer = new StringBuffer(iter.next()); 212 while (iter.hasNext()) { 213 buffer.append(delimiter).append(iter.next()); 214 } 215 return buffer.toString(); 216 } 217 218 /** 219 * Americanize and print the command line arguments. 220 * This main method is just for debugging. 221 * 222 * @param args Command line arguments: a list of words 223 */ 224 public static void main(String[] args) throws IOException { 225 //System.err.println(new Americanize()); 226 //System.err.println(); 227 Americanize convert = new Americanize(); 228 if (args.length == 0) { // stdin -> stdout: 229 BufferedReader buf = new BufferedReader(new InputStreamReader(System.in)); 230 String line; 231 while((line = buf.readLine()) != null) { 232 for(String w : line.split("\\s+")) { 233 System.out.print(convert.americanize(w)); 234 System.out.print(' '); 235 } 236 System.out.println(); 237 } 238 buf.close(); 239 } 240 241 for (String arg : args) { 242 System.out.print(arg); 243 System.out.print(" --> "); 244 System.out.println(convert.americanize(arg)); 245 } 246 } 247 }