001    /*
002     Copyright (c) 2012, Regents of the University of Colorado
003     All rights reserved.
004    
005     Redistribution and use in source and binary forms, with or without modification, 
006     are permitted provided that the following conditions are met:
007    
008     * Redistributions of source code must retain the above copyright notice, this 
009        list of conditions and the following disclaimer.
010       
011     * Redistributions in binary form must reproduce the above copyright notice, 
012        this list of conditions and the following disclaimer in the documentation 
013        and/or other materials provided with the distribution.
014       
015     * Neither the name of the University of Colorado nor the names of its 
016        contributors may be used to endorse or promote products derived from this 
017        software without specific prior written permission.
018    
019     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
020     ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
021     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
022     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
023     ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
024     (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
025     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
026     ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
027     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
028     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029     */
030    
031    package edu.ucdenver.ccp.nlp.biolemmatizer;
032    
033    import java.io.BufferedReader;
034    import java.io.IOException;
035    import java.io.InputStream;
036    import java.io.InputStreamReader;
037    import java.util.Arrays;
038    import java.util.HashMap;
039    import java.util.Iterator;
040    import java.util.Map;
041    import java.util.regex.Matcher;
042    import java.util.regex.Pattern;
043    
044    
045    /**
046     * Takes a String and returns an Americanized version of it based on
047     * a list of British-to-American spellings and some rules.
048     * This is deterministic spelling conversion, and so cannot deal with
049     * certain cases involving complex ambiguities, but it can do most of the
050     * simple cases of English to American conversion.
051     * <p>
052     * <i>The list and rules for handling
053     * British-to-American spellings are derived from:</i>
054     * <code>http://www.tysto.com/uk-us-spelling-list.html</code>.
055     *
056     * @author Original author: Christopher Manning; Modified and extended by Haibin Liu
057     */
public class Americanize {

    /** Words shorter than this are never changed by {@link #americanize(String)}. */
    private static final int MINIMUM_LENGTH_CHANGED = 4;
    /** Words shorter than this are never run through the rewrite patterns. */
    private static final int MINIMUM_LENGTH_PATTERN_MATCH = 6;
    /** Name of the default mapping resource, resolved relative to this class. */
    private static final String DEFAULT_MAPPING_RESOURCE = "BritishToAmerican.mapping";

    /** Mapping from British spelling to American spelling; consulted before any pattern. */
    private final Map<String, String> mappingBritishtoAmerican;

    /**
     * British spelling patterns. Entry {@code i} is rewritten with {@code reps[i]}
     * unless {@code excepts[i]} is non-null and matches the word.
     */
    private static final String[] patStrings = {
        "(ph|an|h|gyn|arch|chim)ae", "haem(at)?o", "aemia$", "([lL])eukaem",
        "programme(s?)$", "^([a-z]{3,})our(s?)$",
    };

    /** Replacement templates, parallel to {@link #patStrings}. */
    private static final String[] reps = {
        "$1e", "hem$1o", "emia", "$1eukem", "program$1", "$1or$2"
    };

    /** Compiled form of {@link #patStrings}. */
    private static final Pattern[] pats = new Pattern[patStrings.length];

    /** Single alternation of all rules, used as a cheap pre-filter. */
    private static final Pattern disjunctivePattern;

    static {
        StringBuilder union = new StringBuilder();
        for (int i = 0; i < patStrings.length; i++) {
            pats[i] = Pattern.compile(patStrings[i]);
            if (i > 0) {
                union.append('|');
            }
            // Make the groups non-capturing rather than deleting the parentheses:
            // deleting them changes the scope of '|' (e.g. "(ph|an|h)ae" would
            // become "ph|an|hae"), which lets the pre-filter accept almost any
            // word. This relies on patStrings containing only plain capturing
            // groups (no escaped parentheses and no "(?" constructs).
            union.append("(?:")
                 .append(patStrings[i].replace("(", "(?:"))
                 .append(')');
        }
        disjunctivePattern = Pattern.compile(union.toString());
    }

    /** Words ending in "our" that must not be rewritten to "or". */
    private static final String[] OUR_EXCEPTIONS = {
        "abatjour", "beflour", "bonjour",
        "calambour", "carrefour", "cornflour", "contour",
        "de[tv]our", "dortour", "dyvour", "downpour",
        "giaour", "glamour", "holour", "inpour", "outpour",
        "pandour", "paramour", "pompadour", "recontour", "repour", "ryeflour",
        "sompnour",
        "tambour", "troubadour", "tregetour", "velour"
    };

    /**
     * Exception patterns, parallel to {@link #patStrings}; {@code null} means the
     * rule has no exceptions. Only the "-our" rule (last entry) has any.
     */
    private static final Pattern[] excepts = {
        null, null, null, null, null,
        Pattern.compile(join(OUR_EXCEPTIONS, "|"))
    };


    /**
     * Creates a converter backed by the bundled British-to-American mapping
     * file, which is looked up as a resource relative to this class.
     *
     * @throws IllegalStateException if the mapping resource cannot be found
     * @throws RuntimeException if the mapping resource cannot be read or parsed
     */
    public Americanize() {
        InputStream is = Americanize.class.getResourceAsStream(DEFAULT_MAPPING_RESOURCE);
        if (is == null) {
            // getResourceAsStream signals "not found" with null; fail with a
            // message that names the resource instead of a bare NullPointerException.
            throw new IllegalStateException(
                    "Mapping file not found on classpath: " + DEFAULT_MAPPING_RESOURCE);
        }
        try {
            mappingBritishtoAmerican = loadSpellingMappingFile(is);
        } catch (IOException e) {
            throw new RuntimeException(
                    "Error while opening mapping file: " + DEFAULT_MAPPING_RESOURCE, e);
        }
    }

    /**
     * Creates a converter backed by a caller-supplied mapping. Each non-blank
     * line of the stream must hold a British spelling and its American
     * spelling separated by a tab. The stream is closed before this
     * constructor returns.
     *
     * @param mappingStream stream of tab-separated British/American pairs
     * @throws IllegalArgumentException if {@code mappingStream} is null
     * @throws IOException if the stream cannot be read or contains a malformed line
     */
    public Americanize(InputStream mappingStream) throws IOException {
        if (mappingStream == null) {
            throw new IllegalArgumentException("mappingStream must not be null");
        }
        mappingBritishtoAmerican = loadSpellingMappingFile(mappingStream);
    }

    /**
     * Loads a British-to-American spelling mapping. Lines are trimmed, blank
     * lines are skipped, and the first two tab-separated fields of every other
     * line are taken as (British, American). The stream is always closed.
     *
     * @param is
     *            InputStream of the mapping file
     * @return a Map object that stores the British-to-American spellings
     * @throws IOException if reading fails or a non-blank line has fewer than two fields
     */
    private static Map<String, String> loadSpellingMappingFile(InputStream is) throws IOException {
        Map<String, String> mapping = new HashMap<String, String>();
        // NOTE(review): the file is decoded with the platform default charset, as
        // before; confirm the mapping file's encoding and pin it explicitly.
        BufferedReader input = new BufferedReader(new InputStreamReader(is));
        try {
            int lineNumber = 0;
            String line;
            while ((line = input.readLine()) != null) {
                lineNumber++;
                line = line.trim();
                if (line.length() == 0) {
                    continue; // tolerate blank lines, e.g. a trailing newline
                }
                String[] pair = line.split("\t");
                if (pair.length < 2) {
                    throw new IOException("Malformed mapping entry at line " + lineNumber
                            + " (expected British<TAB>American): " + line);
                }
                mapping.put(pair[0], pair[1]);
            }
        } finally {
            // Closing the reader also closes the wrapped input stream.
            input.close();
        }
        return mapping;
    }

    /**
     * Convert the spelling of a word from British to American English.
     * This is deterministic spelling conversion, and so cannot deal with
     * certain cases involving complex ambiguities, but it can do most of the
     * simple cases of English to American conversion.
     * <p>
     * The word is first looked up in the mapping table; if it is absent and at
     * least {@code MINIMUM_LENGTH_PATTERN_MATCH} characters long, the first
     * matching rewrite rule whose exception list does not match is applied.
     *
     * @param str The String to be Americanized; must not be null
     * @return The American spelling of the word, or {@code str} itself if nothing applies.
     */
    public String americanize(String str) {
        // No very short words are changed, so short circuit them
        int length = str.length();
        if (length < MINIMUM_LENGTH_CHANGED) {
            return str;
        }
        String mapped = mappingBritishtoAmerican.get(str);
        if (mapped != null) {
            return mapped;
        }
        if (length < MINIMUM_LENGTH_PATTERN_MATCH) {
            return str;
        }
        // One combined regex first: if it does not match, no individual rule can.
        // A fresh Matcher is created per call, so no matcher state is shared
        // between calls.
        if (!disjunctivePattern.matcher(str).find()) {
            return str;
        }
        for (int i = 0; i < pats.length; i++) {
            Matcher m = pats[i].matcher(str);
            if (!m.find()) {
                continue;
            }
            Pattern exception = excepts[i];
            if (exception != null && exception.matcher(str).find()) {
                // Word is on this rule's exception list; try the remaining rules.
                continue;
            }
            return m.replaceAll(reps[i]);
        }
        return str;
    }

    /**
     * Concatenates String items with a specified delimiter.
     *
     * @param s the items to join
     * @param delimiter the text placed between consecutive items
     * @return the joined String, or the empty String if {@code s} has no items
     */
    public static final String join(String[] s, String delimiter) {
        if (s.length == 0) {
            return "";
        }
        StringBuilder joined = new StringBuilder(s[0]);
        for (int i = 1; i < s.length; i++) {
            joined.append(delimiter).append(s[i]);
        }
        return joined.toString();
    }

    /**
     * Americanize and print the command line arguments.
     * With no arguments, whitespace-separated words are read from standard
     * input and the converted words are written to standard output instead.
     * This main method is just for debugging.
     *
     * @param args Command line arguments: a list of words
     * @throws IOException if reading standard input fails
     */
    public static void main(String[] args) throws IOException {
        Americanize convert = new Americanize();
        if (args.length == 0) { // stdin -> stdout:
            BufferedReader buf = new BufferedReader(new InputStreamReader(System.in));
            try {
                String line;
                while ((line = buf.readLine()) != null) {
                    for (String w : line.split("\\s+")) {
                        System.out.print(convert.americanize(w));
                        System.out.print(' ');
                    }
                    System.out.println();
                }
            } finally {
                buf.close();
            }
            return;
        }

        for (String arg : args) {
            System.out.print(arg);
            System.out.print(" --> ");
            System.out.println(convert.americanize(arg));
        }
    }
}