001 /*
002 Copyright (c) 2012, Regents of the University of Colorado
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without modification,
006 are permitted provided that the following conditions are met:
007
008 * Redistributions of source code must retain the above copyright notice, this
009 list of conditions and the following disclaimer.
010
011 * Redistributions in binary form must reproduce the above copyright notice,
012 this list of conditions and the following disclaimer in the documentation
013 and/or other materials provided with the distribution.
014
015 * Neither the name of the University of Colorado nor the names of its
016 contributors may be used to endorse or promote products derived from this
017 software without specific prior written permission.
018
019 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
023 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030
031 package edu.ucdenver.ccp.nlp.biolemmatizer;
032
033 import java.io.BufferedReader;
034 import java.io.IOException;
035 import java.io.InputStream;
036 import java.io.InputStreamReader;
037 import java.util.Arrays;
038 import java.util.HashMap;
039 import java.util.Iterator;
040 import java.util.Map;
041 import java.util.regex.Matcher;
042 import java.util.regex.Pattern;
043
044
/**
 * Takes a String and returns an Americanized version of it based on
 * a list of British-to-American spellings and some rules.
 * This is deterministic spelling conversion, and so cannot deal with
 * certain cases involving complex ambiguities, but it can do most of the
 * simple cases of British-to-American conversion.
 * <p>
 * Conversion is a two-stage lookup: an exact-match dictionary loaded from a
 * tab-separated mapping file is consulted first, and a small set of regular
 * expression rewrite rules is applied only when the dictionary has no entry.
 * <p>
 * An instance does not modify its own state after construction; every call
 * to {@link #americanize(String)} allocates its own {@link Matcher}s.
 * <p>
 * <i>The list and rules for handling
 * British-to-American spellings is derived from:</i>
 * <code>http://www.tysto.com/uk-us-spelling-list.html</code>.
 *
 * @author Original author: Christopher Manning; Modified and extended by Haibin Liu
 */
public class Americanize {

    /** No word shorter in length than this is changed by Americanize */
    private static final int MINIMUM_LENGTH_CHANGED = 4;
    /** No word shorter in length than this can match a Pattern */
    private static final int MINIMUM_LENGTH_PATTERN_MATCH = 6;
    /** Name of the bundled mapping resource, resolved relative to this class */
    private static final String DEFAULT_MAPPING_FILE = "BritishToAmerican.mapping";
    /**
     * Charset used to decode mapping files, so that loading does not depend on
     * the platform default. NOTE(review): assumes the bundled mapping file is
     * ASCII/UTF-8 encoded -- confirm against the resource.
     */
    private static final String MAPPING_CHARSET = "UTF-8";

    /** mapping from British spelling to American spelling */
    private final Map<String, String> mappingBritishtoAmerican;

    /** Rewrite rules: patStrings[i] is replaced by reps[i] unless excepts[i] matches. */
    private static final String[] patStrings = {
        "(ph|an|h|gyn|arch|chim)ae", "haem(at)?o", "aemia$", "([lL])eukaem",
        "programme(s?)$", "^([a-z]{3,})our(s?)$",
    };

    /** Replacement strings, parallel to {@link #patStrings}. */
    private static final String[] reps = {
        "$1e", "hem$1o", "emia", "$1eukem", "program$1", "$1or$2"
    };

    /** Compiled form of {@link #patStrings}. */
    private static final Pattern[] pats = new Pattern[patStrings.length];

    /** Union of all rules; used as a single cheap test before trying each rule in turn. */
    private static final Pattern disjunctivePattern;

    static {
        StringBuilder disjunction = new StringBuilder();
        for (int i = 0, len = pats.length; i < len; i++) {
            pats[i] = Pattern.compile(patStrings[i]);
            if (i > 0) {
                disjunction.append('|');
            }
            // Each rule is embedded unmodified inside a non-capturing group.
            // Stripping the rule's own parentheses (as was done previously)
            // changes the scope of '|' inside a rule, e.g. "(ph|an|h)ae"
            // degrades to "ph|an|hae", which matches nearly every word and
            // makes the prefilter useless.
            disjunction.append("(?:").append(patStrings[i]).append(')');
        }
        disjunctivePattern = Pattern.compile(disjunction.toString());
    }

    /** Words ending in "our" that must NOT be rewritten to "or". */
    private static final String[] OUR_EXCEPTIONS = {
        "abatjour", "beflour", "bonjour",
        "calambour", "carrefour", "cornflour", "contour",
        "de[tv]our", "dortour", "dyvour", "downpour",
        "giaour", "glamour", "holour", "inpour", "outpour",
        "pandour", "paramour", "pompadour", "recontour", "repour", "ryeflour",
        "sompnour",
        "tambour", "troubadour", "tregetour", "velour"
    };

    /**
     * Exception patterns, parallel to {@link #patStrings}; a null entry means
     * the rule at that index has no exceptions.
     */
    private static final Pattern[] excepts = {
        null, null, null, null, null,
        Pattern.compile(join(OUR_EXCEPTIONS, "|"))
    };


    /**
     * Constructor to load the bundled British-to-American spelling mapping file.
     *
     * @throws RuntimeException if the bundled mapping resource cannot be found or read
     */
    public Americanize() {
        this(Americanize.class.getResourceAsStream(DEFAULT_MAPPING_FILE), DEFAULT_MAPPING_FILE);
    }

    /**
     * Constructor to load a caller-supplied British-to-American spelling mapping.
     * The stream must contain one <code>british&lt;TAB&gt;american</code> pair
     * per line; blank lines are ignored. The stream is closed before this
     * constructor returns.
     *
     * @param mappingStream InputStream of the mapping data
     * @throws RuntimeException if the stream is null, unreadable or malformed
     */
    public Americanize(InputStream mappingStream) {
        this(mappingStream, "<supplied stream>");
    }

    /**
     * Shared constructor logic.
     *
     * @param is         stream to read the mapping from; may be null when a resource lookup failed
     * @param sourceName name of the mapping source, used in error messages only
     */
    private Americanize(InputStream is, String sourceName) {
        // Class.getResourceAsStream returns null for a missing resource; fail
        // with a clear message instead of an anonymous NullPointerException.
        if (is == null) {
            throw new RuntimeException("Mapping file not found: " + sourceName);
        }
        Map<String, String> loaded;
        try {
            loaded = loadSpellingMappingFile(is);
        } catch (IOException e) {
            throw new RuntimeException("Error while opening mapping file: " + sourceName, e);
        }
        mappingBritishtoAmerican = loaded;
    }

    /**
     * Static method to load a British-to-American spelling mapping file.
     * Each non-blank line is trimmed and split on tab characters; the first
     * field is the British spelling and the second the American spelling.
     * The stream is always closed, whether or not loading succeeds.
     *
     * @param is
     *            InputStream of the mapping file
     * @return a Map object that stores the British-to-American spellings
     * @throws IOException if the stream cannot be read or a non-blank line
     *             does not contain two tab-separated fields
     */
    private static Map<String, String> loadSpellingMappingFile(InputStream is) throws IOException {
        Map<String, String> mapping = new HashMap<String, String>();
        try {
            BufferedReader input = new BufferedReader(new InputStreamReader(is, MAPPING_CHARSET));
            int lineNumber = 0;
            String line;
            while ((line = input.readLine()) != null) {
                lineNumber++;
                line = line.trim();
                if (line.length() == 0) {
                    // tolerate blank lines (e.g. a trailing newline at end of file)
                    continue;
                }
                String[] pair = line.split("\t");
                if (pair.length < 2) {
                    throw new IOException("Malformed mapping entry at line " + lineNumber
                            + " (expected british<TAB>american): '" + line + "'");
                }
                mapping.put(pair[0], pair[1]);
            }
        } finally {
            // closing the underlying stream releases the readers wrapped around it
            is.close();
        }
        return mapping;
    }

    /**
     * Convert the spelling of a word from British to American English.
     * This is deterministic spelling conversion, and so cannot deal with
     * certain cases involving complex ambiguities, but it can do most of the
     * simple cases of British-to-American conversion.
     * <p>
     * Words shorter than 4 characters are returned unchanged. Otherwise the
     * mapping file is consulted for an exact match; if there is none and the
     * word has at least 6 characters, the first applicable rewrite rule
     * (one that matches and whose exception pattern does not) is applied.
     *
     * @param str The String to be Americanized; must not be null
     * @return The American spelling of the word, or <code>str</code> itself
     *         when no mapping entry or rule applies.
     */
    public String americanize(String str) {
        // No very short words are changed, so short circuit them
        int length = str.length();
        if (length < MINIMUM_LENGTH_CHANGED) {
            return str;
        }
        String mapped = mappingBritishtoAmerican.get(str);
        if (mapped != null) {
            return mapped;
        }

        if (length < MINIMUM_LENGTH_PATTERN_MATCH) {
            return str;
        }
        // first do one disjunctive regex and return unless matches. Faster!
        // (But still allocates matcher each time; avoiding this would make this class not threadsafe....)
        if (!disjunctivePattern.matcher(str).find()) {
            return str;
        }
        for (int i = 0; i < pats.length; i++) {
            Matcher m = pats[i].matcher(str);
            if (!m.find()) {
                continue;
            }
            Pattern ex = excepts[i];
            if (ex != null && ex.matcher(str).find()) {
                // the word is a listed exception to this rule; try the next rule
                continue;
            }
            // replaceAll resets the matcher, so the earlier find() does not
            // affect which occurrences are replaced
            return m.replaceAll(reps[i]);
        }
        return str;
    }

    /**
     * static method to concatenate String items with a specified delimiter
     *
     * @param s         the items to concatenate; must not be null
     * @param delimiter the text inserted between consecutive items
     * @return concatenated String items with a specified delimiter, or the
     *         empty string when <code>s</code> has no items
     */
    public static final String join(String[] s, String delimiter) {
        if (s.length == 0) {
            return "";
        }
        StringBuilder joined = new StringBuilder(s[0]);
        for (int i = 1; i < s.length; i++) {
            joined.append(delimiter).append(s[i]);
        }
        return joined.toString();
    }

    /**
     * Americanize and print the command line arguments.
     * With no arguments, reads whitespace-separated words from standard input
     * and writes the converted words to standard output, line by line.
     * This main method is just for debugging.
     *
     * @param args Command line arguments: a list of words
     * @throws IOException if reading standard input fails
     */
    public static void main(String[] args) throws IOException {
        Americanize convert = new Americanize();
        if (args.length == 0) { // stdin -> stdout:
            BufferedReader buf = new BufferedReader(new InputStreamReader(System.in));
            try {
                String line;
                while ((line = buf.readLine()) != null) {
                    for (String w : line.split("\\s+")) {
                        System.out.print(convert.americanize(w));
                        System.out.print(' ');
                    }
                    System.out.println();
                }
            } finally {
                buf.close();
            }
            return;
        }

        for (String arg : args) {
            System.out.print(arg);
            System.out.print(" --> ");
            System.out.println(convert.americanize(arg));
        }
    }
}