/*
Copyright (c) 2012, Regents of the University of Colorado
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the University of Colorado nor the names of its
  contributors may be used to endorse or promote products derived from this
  software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package edu.ucdenver.ccp.nlp.biolemmatizer;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.EnglishLemmatizer;
import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.Lemmatizer;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.DefaultLexicon;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.DefaultWordLexicon;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.Lexicon;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.LexiconEntry;
import edu.northwestern.at.utils.corpuslinguistics.partsofspeech.PartOfSpeechTags;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.PennTreebankTokenizer;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer;

// Stanford CoreNLP class used below for American English spelling conversion
import edu.stanford.nlp.process.Americanize;

/**
 * BioLemmatizer: Lemmatize a word in biomedical text and return its lemma; the part of speech
 * (POS) of the word is optional.
 *
 * <p>
 * Usage:
 * </p>
 *
 * <p>
 * <code>
 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] {@literal <input_string>} [POS tag] or<br>
 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] -i {@literal <input_file_name> -o <output_file_name>} or<br>
 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] -t<br>
 * </code>
 * </p>
 *
 * <p>
 * Example:
 * </p>
 *
 * <p>
 * <code>
 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar catalyses NNS
 * </code>
 * </p>
 *
 * <p>
 * Please see the README file for more usage examples.
 * </p>
 *
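 * <p>
 * Programmatic use, a minimal sketch (the expected lemma follows the command-line example above):
 * </p>
 *
 * <pre>
 * BioLemmatizer bioLemmatizer = new BioLemmatizer();
 * LemmataEntry lemmata = bioLemmatizer.lemmatizeByLexiconAndRules("catalyses", "NNS");
 * System.out.println(lemmata.lemmasToString()); // expected: "catalysis"
 * </pre>
 *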
 * @author Haibin Liu <Haibin.Liu@ucdenver.edu>, William A Baumgartner Jr
 *         <William.Baumgartner@ucdenver.edu> and Karin Verspoor <Karin.Verspoor@ucdenver.edu>
 */
public class BioLemmatizer {
    /** Lemma separator character */
    public static String lemmaSeparator = "||";

    /** Rule-based lemmatizer */
    public Lemmatizer lemmatizer;

    /** Word lexicon for lemma lookup */
    public Lexicon wordLexicon;

    /** NUPOS tags */
    public PartOfSpeechTags partOfSpeechTags;

    /** Extracts individual word parts from a contracted word. */
    public WordTokenizer spellingTokenizer;

    /** Hierarchical mapping from Penn Treebank POS tags to NUPOS tags */
    public Map<String, String[]> mappingPennPOStoNUPOS;

    /** Hierarchical mapping from major word classes to Penn Treebank POS tags */
    public Map<String, String[]> mappingMajorClasstoPennPOS;

    /** Name of the Part-Of-Speech mapping file */
    protected static String mappingFileName;

    /** POSEntry object used to retrieve POS tag information */
    public POSEntry posEntry;
    /**
     * Default constructor; loads the lexicon from the classpath
     */
    public BioLemmatizer() {
        this(null);
    }

    /**
     * Constructor to initialize the class fields
     *
     * @param lexiconFile
     *            a reference to the lexicon file to use. If null, the lexicon that comes with the
     *            BioLemmatizer distribution is loaded from the classpath
     */
    public BioLemmatizer(File lexiconFile) {
        // Get the default rule-based lemmatizer.
        try {
            lemmatizer = new MorphAdornerLemmatizer();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        // Get the default word lexicon.
        try {
            wordLexicon = new BioWordLexicon(lexiconFile);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // Get the part-of-speech tags from the word lexicon.
        partOfSpeechTags = wordLexicon.getPartOfSpeechTags();
        // Get the spelling tokenizer.
        spellingTokenizer = new PennTreebankTokenizer();
        // Set the lexicon which may provide lemmata.
        lemmatizer.setLexicon(wordLexicon);
        // Set the dictionary for checking lemmata after applying lemmatization rules.
        lemmatizer.setDictionary(setDictionary(wordLexicon));

        // Load the Part-Of-Speech mapping files
        mappingFileName = "PennPOStoNUPOS.mapping";
        InputStream is = BioLemmatizer.class.getResourceAsStream(mappingFileName);
        try {
            mappingPennPOStoNUPOS = loadPOSMappingFile(is);
        } catch (IOException e) {
            throw new RuntimeException("Unable to load mapping: " + mappingFileName, e);
        }

        mappingFileName = "MajorClasstoPennPOS.mapping";
        is = BioLemmatizer.class.getResourceAsStream(mappingFileName);
        try {
            mappingMajorClasstoPennPOS = loadPOSMappingFile(is);
        } catch (IOException e) {
            throw new RuntimeException("Unable to load mapping: " + mappingFileName, e);
        }

        // Load the POS tagsets
        posEntry = new POSEntry();
    }

    /**
     * Static method to load a Part-Of-Speech mapping file
     *
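     * <p>
     * Each line of the mapping file is expected to hold a tab-separated pair whose second field is
     * a comma-separated list of mapped tags (a format sketch, not literal file content):
     * </p>
     *
     * <pre>
     * &lt;tag&gt;&lt;TAB&gt;&lt;mappedTag1&gt;,&lt;mappedTag2&gt;,...
     * </pre>
     *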
     * @param is
     *            InputStream of the mapping file
     * @return a Map object that stores the hierarchical mapping information in the file
     * @throws IOException
     */
    static Map<String, String[]> loadPOSMappingFile(InputStream is) throws IOException {
        Map<String, String[]> mapping = new HashMap<String, String[]>();

        try {
            InputStreamReader isr = new InputStreamReader(is);
            BufferedReader input = new BufferedReader(isr);

            String line = null;
            while ((line = input.readLine()) != null) {
                line = line.trim();
                String[] pair = line.split("\t");
                String[] mappingSet = pair[1].split(",");
                mapping.put(pair[0], mappingSet);
                // also store the key with any leading byte-order mark (U+FEFF) stripped
                mapping.put(pair[0].replaceAll("^\\uFEFF", ""), mappingSet);
            }

            input.close();
            isr.close();
        } finally {
            is.close();
        }
        return mapping;
    }

    /**
     * Create a dictionary from a word lexicon for validating lemmata resulting from lemmatization
     * rules
     *
     * @param wordLexicon
     *            a word lexicon
     * @return a set that contains a dictionary generated from the word lexicon
     */
    private Set<String> setDictionary(Lexicon wordLexicon) {
        Set<String> dictionarySet = new HashSet<String>();

        // generate the dictionary from the lexicon
        String[] lexiconEntries = wordLexicon.getEntries();
        for (String entry : lexiconEntries) {
            String[] lemmata = wordLexicon.getLemmata(entry);
            for (String lemma : lemmata) {
                dictionarySet.add(lemma.toLowerCase());
            }
        }

        return dictionarySet;
    }

    /**
     * Retrieve an array of NUPOS tags corresponding to a Penn Treebank POS tag
     *
     * @param partOfSpeech
     *            a POS tag
     * @return an array of corresponding NUPOS tags
     */
    private String[] getNUPOSTagFromPennPOS(String partOfSpeech) {
        String[] nuPOSTag = mappingPennPOStoNUPOS.get(partOfSpeech.toUpperCase());
        return nuPOSTag != null ? nuPOSTag : new String[] { partOfSpeech };
    }

    /**
     * Retrieve an array of Penn Treebank POS tags corresponding to a NUPOS tag
     *
     * @param partOfSpeech
     *            a POS tag
     * @return an array of corresponding Penn Treebank POS tags
     */
    private String[] getPennPOSFromNUPOS(String partOfSpeech) {
        List<String> result = new ArrayList<String>();
        for (String key : mappingPennPOStoNUPOS.keySet()) {
            for (String value : mappingPennPOStoNUPOS.get(key)) {
                if (value.equals(partOfSpeech)) {
                    result.add(key);
                    break;
                }
            }
        }
        return result.size() != 0 ? result.toArray(new String[result.size()]) : new String[] { partOfSpeech };
    }

    /**
     * Retrieve sibling Penn Treebank POS tags of a Penn Treebank POS tag from the POS hierarchy
     *
     * @param partOfSpeech
     *            a Penn Treebank POS tag
     * @return sibling Penn Treebank POS tags of the Penn Treebank POS tag
     */
    private String[] getSiblingPennPOSTag(String partOfSpeech) {
        // check if partOfSpeech exists in the hierarchy
        boolean globalFlag = false;
        for (String key : mappingMajorClasstoPennPOS.keySet()) {
            String[] posTag = mappingMajorClasstoPennPOS.get(key);
            for (String pos : posTag) {
                if (pos.equals(partOfSpeech)) {
                    globalFlag = true;
                    break;
                }
            }
            if (globalFlag)
                break;
        }

        if (globalFlag) {
            String foundKey = "";
            for (String key : mappingMajorClasstoPennPOS.keySet()) {
                String[] posTag = mappingMajorClasstoPennPOS.get(key);
                boolean localFlag = false;
                for (String pos : posTag) {
                    if (pos.equals(partOfSpeech)) {
                        foundKey = key;
                        localFlag = true;
                        break;
                    }
                }
                if (localFlag)
                    break;
            }
            List<String> merge = new ArrayList<String>();
            for (String pos : mappingMajorClasstoPennPOS.get(foundKey)) {
                if (!pos.equals(partOfSpeech))
                    merge.add(pos);
            }
            return merge.toArray(new String[merge.size()]);
        } else {
            return new String[] { partOfSpeech };
        }
    }

    /**
     * Retrieve sibling NUPOS tags of a Penn Treebank POS tag from the POS hierarchy
     *
     * @param partOfSpeech
     *            a Penn Treebank POS tag
     * @return sibling NUPOS tags of the Penn Treebank POS tag
     */
    private String[] getSiblingNUPOSTag(String partOfSpeech) {
        // check if partOfSpeech exists in the hierarchy
        boolean globalFlag = false;
        for (String key : mappingMajorClasstoPennPOS.keySet()) {
            String[] posTag = mappingMajorClasstoPennPOS.get(key);
            for (String pos : posTag) {
                if (pos.equals(partOfSpeech)) {
                    globalFlag = true;
                    break;
                }
            }
            if (globalFlag)
                break;
        }

        if (globalFlag) {
            String foundKey = "";
            for (String key : mappingMajorClasstoPennPOS.keySet()) {
                String[] posTag = mappingMajorClasstoPennPOS.get(key);
                boolean localFlag = false;
                for (String pos : posTag) {
                    if (pos.equals(partOfSpeech)) {
                        foundKey = key;
                        localFlag = true;
                        break;
                    }
                }
                if (localFlag)
                    break;
            }
            List<String> merge = new ArrayList<String>();
            for (String pos : mappingMajorClasstoPennPOS.get(foundKey)) {
                if (!pos.equals(partOfSpeech)) {
                    merge.addAll(Arrays.asList(mappingPennPOStoNUPOS.get(pos.toUpperCase())));
                }
            }

            return merge.toArray(new String[merge.size()]);
        } else {
            return new String[] { partOfSpeech };
        }
    }

    /**
     * Retrieve lemmas and the corresponding categories of the input string
     *
     * @param spelling
     *            an input string
     * @return a Map object that stores lemmas and categories of the string; key: category, value:
     *         lemma
     */
    private Map<String, String> getLemmasAndCategories(String spelling) {
        Map<String, String> lemmasAndCategories = new HashMap<String, String>();
        LexiconEntry lexiconEntry = wordLexicon.getLexiconEntry(spelling);
        if (lexiconEntry != null)
            lemmasAndCategories = lexiconEntry.lemmata;
        return lemmasAndCategories;
    }

    /**
     * Clean up a raw lemma resulting from the lemmatization rules by removing a trailing
     * apostrophe, if present
     *
     * @param lemma
     *            a raw lemma
     * @return the cleaned lemma
     */
    private static String cleanUpLemma(String lemma) {
        String newLemma = lemma;
        String lastChar = lemma.substring(lemma.length() - 1);
        if (lastChar.equals("'")) {
            newLemma = lemma.substring(0, lemma.length() - 1);
        }
        return newLemma;
    }

    /**
     * Convert special Unicode characters into modern English spelling
     *
     * @param input
     *            an input string
     * @return the input string in modern English spelling
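     *         For example, "encyclopædia" becomes "encyclopaedia" via the "æ" (u00E6) -> "ae"
     *         mapping below (an illustrative case).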
     */
    static String unicodeHandler(String input) {
        // define the mapping between special Unicode characters and modern English spelling
        Map<String, String> specialUnicodeCharToModernEnglishMapping = new HashMap<String, String>();

        specialUnicodeCharToModernEnglishMapping.put("u00E6", "ae");
        specialUnicodeCharToModernEnglishMapping.put("u0153", "oe");
        specialUnicodeCharToModernEnglishMapping.put("u00E4", "a");
        specialUnicodeCharToModernEnglishMapping.put("u00E0", "a");
        specialUnicodeCharToModernEnglishMapping.put("u00E1", "a");
        specialUnicodeCharToModernEnglishMapping.put("u0113", "e");
        specialUnicodeCharToModernEnglishMapping.put("u00E9", "e");
        specialUnicodeCharToModernEnglishMapping.put("u00E8", "e");
        specialUnicodeCharToModernEnglishMapping.put("u00EB", "e");
        specialUnicodeCharToModernEnglishMapping.put("u00EF", "i");
        specialUnicodeCharToModernEnglishMapping.put("u00F1", "n");
        specialUnicodeCharToModernEnglishMapping.put("u014D", "o");
        specialUnicodeCharToModernEnglishMapping.put("u00F6", "o");
        specialUnicodeCharToModernEnglishMapping.put("u00F4", "o");
        specialUnicodeCharToModernEnglishMapping.put("u016B", "u");
        specialUnicodeCharToModernEnglishMapping.put("u00FA", "u");

        String output = input;
        for (String unicode : specialUnicodeCharToModernEnglishMapping.keySet()) {
            // prefixing the key with a backslash turns e.g. "u00E6" into a regex Unicode
            // escape, so replaceAll matches the actual special character
            String regex = "\\" + unicode;
            output = output.replaceAll(regex, specialUnicodeCharToModernEnglishMapping.get(unicode));
        }

        return output;
    }

    /**
     * Lemmatize a string with POS tag using the lexicon only
     *
     * @param spelling
     *            an input string
     * @param partOfSpeech
     *            POS tag of the input string
     * @return a LemmataEntry object containing lemma and POS information
     */
    public LemmataEntry lemmatizeByLexicon(String spelling, String partOfSpeech) {
        Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>();
        String lemmata = spelling;
        String lemmataTag;
        if (partOfSpeech == null)
            partOfSpeech = "";
        // default POS tag = NONE
        if (partOfSpeech.trim().length() == 0)
            lemmataTag = "NONE";
        else
            lemmataTag = partOfSpeech;

        // check the POS tagset
        String tagSetLabel = posEntry.getTagSetLabel(partOfSpeech);

        String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech);

        // Several lexicon search strategies are tried in turn to make the most of the lexicon
        String lemma = "*";
        String category = "*";

        if (tagSetLabel.equals("PennPOS")) {
            // direct Penn POS tag search
            lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech);
            if (lemma.equals("*")) {
                lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech);
            }
            if (!lemma.equals("*")) {
                lemmata = lemma;
                category = partOfSpeech;
                // System.out.println("found in the Penn direct lexicon: " + lemma);
            }
            // Penn POS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalPennPOSTag = getSiblingPennPOSTag(partOfSpeech);
                for (String pos : hierarchicalPennPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        lemmata = lemma;
                        category = pos;
                        // System.out.println("found in the Penn hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
            // Convert the Penn POS tag into NUPOS tags and search
            if (lemma.equals("*")) {
                for (String pos : nuPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        lemmata = lemma;
                        category = pos;
                        // System.out.println("found in the converted NU direct lexicon: " + lemma);
                        break;
                    }
                }
            }
            // NUPOS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalNUPOSTag = getSiblingNUPOSTag(partOfSpeech);
                for (String pos : hierarchicalNUPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        lemmata = lemma;
                        category = pos;
                        // System.out.println("found in the converted NU hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
        } else if (tagSetLabel.equals("NUPOS")) {
            // direct NUPOS tag search
            lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech);
            if (lemma.equals("*")) {
                lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech);
            }
            if (!lemma.equals("*")) {
                lemmata = lemma;
                category = partOfSpeech;
                // System.out.println("found in the NU direct lexicon: " + lemma);
            }
            // NUPOS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalNUPOSTag = getSiblingNUPOSTag(getPennPOSFromNUPOS(partOfSpeech)[0]);
                for (String pos : hierarchicalNUPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        lemmata = lemma;
                        category = pos;
                        // System.out.println("found in the NU hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
        }

        // backup lexicon lookup: search without POS tags and return all lemmas
        Map<String, String> lemmasAndCategories = new HashMap<String, String>();
        if (tagSetLabel.equals("NONE") || lemma.equals("*")) {
            // if (tagSetLabel.equals("NONE")) {
            lemmasAndCategories = getLemmasAndCategories(spelling.toLowerCase());
            if (lemmasAndCategories.isEmpty()) {
                lemmasAndCategories = getLemmasAndCategories(spelling.toUpperCase());
            }
        }

        // a lemma was found in the lexicon
        if (!lemmasAndCategories.isEmpty()) {
            lemmataAndLemmataTag = lemmasAndCategories;
            // System.out.println("found in the lexicon");
        } else if (!lemma.equals("*")) {
            lemmata = lemma;
            lemmataTag = category;
            lemmataAndLemmataTag.put(lemmataTag, lemmata);
        }
        // the lexicon has been checked but nothing was found; return the original input
        else
            lemmataAndLemmataTag.put(lemmataTag, lemmata);

        return new LemmataEntry(lemmataAndLemmataTag, posEntry);
    }

    /**
     * Lemmatize a string with POS tag using lemmatization rules only
     *
     * @param spelling
     *            an input string
     * @param partOfSpeech
     *            POS tag of the input string
     * @return a LemmataEntry object containing lemma and POS information
     */
    public LemmataEntry lemmatizeByRules(String spelling, String partOfSpeech) {
        // option to reset the dictionary the rule-based lemmatizer uses to validate results:
        // lemmatizer.setDictionary(new HashSet<String>());

        Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>();
        String lemmata = spelling;
        String lemmataTag;
        // default POS tag = NONE
        if (partOfSpeech == null)
            partOfSpeech = "";
        if (partOfSpeech.trim().length() == 0)
            lemmataTag = "NONE";
        else
            lemmataTag = partOfSpeech;

        String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech);

        // Use the rule-based lemmatizer.

        // Get the lemmatization word class for the part of speech.
        String lemmaClass = "";
        for (String pos : nuPOSTag) {
            lemmaClass = partOfSpeechTags.getLemmaWordClass(pos);
            if (lemmaClass.length() != 0) {
                break;
            }
        }

        // Do not lemmatize words which should not be lemmatized, including proper names.
        if (!lemmatizer.cantLemmatize(spelling) && !lemmaClass.equals("none")) {
            // Try the compound word exceptions list first.
            lemmata = lemmatizer.lemmatize(spelling, "compound");

            // If the lemma was not found, keep trying.
            if (lemmata.equalsIgnoreCase(spelling)) {
                // Extract the individual word parts; there may be more than one for a contraction.
                List<String> wordList = spellingTokenizer.extractWords(spelling);

                // If there is just one word part, get its lemma.
                if (!partOfSpeechTags.isCompoundTag(partOfSpeech) || (wordList.size() == 1)) {
                    if (lemmaClass.length() == 0) {
                        lemmata = lemmatizer.lemmatize(spelling);
                    } else {
                        lemmata = lemmatizer.lemmatize(spelling, lemmaClass);
                    }
                }
                // More than one word part: get the lemma for each part and concatenate them with
                // the lemma separator to form a compound lemma.
                else {
                    lemmata = "";
                    String[] posTags = partOfSpeechTags.splitTag(partOfSpeech);

                    if (posTags.length == wordList.size()) {
                        for (int i = 0; i < wordList.size(); i++) {
                            String wordPiece = wordList.get(i);
                            if (i > 0) {
                                lemmata = lemmata + lemmaSeparator;
                            }

                            LemmataEntry lemmaPiece = lemmatizeByRules(wordPiece, posTags[i]);

                            lemmata = lemmata + lemmaPiece.lemmasToString();
                        }
                    }
                }
            }
        }

        lemmataAndLemmataTag.put(lemmataTag, lemmata);

        return new LemmataEntry(lemmataAndLemmataTag, posEntry);
    }

    /**
     * Lemmatize a string with POS tag using both lexicon lookup and lemmatization rules. This is
     * the preferred method, as it gives the best lemmatization performance.
     *
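     * <p>
     * A minimal usage sketch (the lemma shown is illustrative):
     * </p>
     *
     * <pre>
     * LemmataEntry entry = bioLemmatizer.lemmatizeByLexiconAndRules("studies", "NNS");
     * String lemma = entry.lemmasToString(); // expected: "study"
     * </pre>
     *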
     * @param spelling
     *            an input string
     * @param partOfSpeech
     *            POS tag of the input string
     * @return a LemmataEntry object containing lemma and POS information
     */
    public LemmataEntry lemmatizeByLexiconAndRules(String spelling, String partOfSpeech) {

        Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>();
        String lemmata = spelling;
        String lemmataTag;
        // default POS tag = NONE
        if (partOfSpeech == null)
            partOfSpeech = "";
        if (partOfSpeech.trim().length() == 0)
            lemmataTag = "NONE";
        else
            lemmataTag = partOfSpeech;

        // check the POS tagset
        String tagSetLabel = posEntry.getTagSetLabel(partOfSpeech);

        String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech);

        // Try the lexicon first; several search strategies are tried in turn to make the most
        // of the lexicon
        String lemma = "*";
        String category = "*";

        if (tagSetLabel.equals("PennPOS")) {
            // direct Penn POS tag search
            lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech);
            if (lemma.equals("*")) {
                lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech);
            }
            if (!lemma.equals("*")) {
                category = partOfSpeech;
                // System.out.println("found in the Penn direct lexicon: " + lemma);
            }
            // Penn POS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalPennPOSTag = getSiblingPennPOSTag(partOfSpeech);
                for (String pos : hierarchicalPennPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        category = pos;
                        // System.out.println("found in the Penn hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
            // Convert the Penn POS tag into NUPOS tags and search
            if (lemma.equals("*")) {
                for (String pos : nuPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        category = pos;
                        // System.out.println("found in the converted NU direct lexicon: " + lemma);
                        break;
                    }
                }
            }
            // NUPOS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalNUPOSTag = getSiblingNUPOSTag(partOfSpeech);
                for (String pos : hierarchicalNUPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        category = pos;
                        // System.out.println("found in the converted NU hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
        } else if (tagSetLabel.equals("NUPOS")) {
            // direct NUPOS tag search
            lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech);
            if (lemma.equals("*")) {
                lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech);
            }
            if (!lemma.equals("*")) {
                category = partOfSpeech;
                // System.out.println("found in the NU direct lexicon: " + lemma);
            }
            // NUPOS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalNUPOSTag = getSiblingNUPOSTag(getPennPOSFromNUPOS(partOfSpeech)[0]);
                for (String pos : hierarchicalNUPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        category = pos;
                        // System.out.println("found in the NU hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
        }

        // if the tagSetLabel is NONE, invoke the backup lexicon lookup:
        // search without POS tags and return all lemmas
        Map<String, String> lemmasAndCategories = new HashMap<String, String>();
        // if (tagSetLabel.equals("NONE") || lemma.equals("*")) {
        if (tagSetLabel.equals("NONE")) {
            lemmasAndCategories = getLemmasAndCategories(spelling.toLowerCase());
            if (lemmasAndCategories.isEmpty()) {
                lemmasAndCategories = getLemmasAndCategories(spelling.toUpperCase());
            }
        }

        // a lemma was found in the lexicon
        if (!lemmasAndCategories.isEmpty()) {
            lemmataAndLemmataTag = lemmasAndCategories;
            // System.out.println("found in the lexicon");
        } else if (!lemma.equals("*")) {
            lemmata = lemma;
            lemmataTag = category;
            lemmataAndLemmataTag.put(lemmataTag, lemmata);
        }
        // for testing purposes, to test the lexicon only:
        // else lemmataAndLemmataTag.put(lemmataTag, lemmata);

        // Lemma not found in the word lexicon. Use the rule-based lemmatizer.
        else {

            // Get the lemmatization word class for the part of speech.
            String lemmaClass = "";
            for (String pos : nuPOSTag) {
                lemmaClass = partOfSpeechTags.getLemmaWordClass(pos);
                if (lemmaClass.length() != 0) {
                    break;
                }
            }

            // Do not lemmatize words which should not be lemmatized, including proper names.
            if (!lemmatizer.cantLemmatize(spelling) && !lemmaClass.equals("none")) {
                // Try the compound word exceptions list first.
                lemmata = lemmatizer.lemmatize(spelling, "compound");

                // If the lemma was not found, keep trying.
                if (lemmata.equalsIgnoreCase(spelling)) {
                    // Extract the individual word parts; there may be more than one for a
                    // contraction.
                    List<String> wordList = spellingTokenizer.extractWords(spelling);

                    // If there is just one word part, get its lemma.
                    if (!partOfSpeechTags.isCompoundTag(partOfSpeech) || (wordList.size() == 1)) {
                        if (lemmaClass.length() == 0) {
                            lemmata = lemmatizer.lemmatize(spelling);
                        } else {
                            lemmata = lemmatizer.lemmatize(spelling, lemmaClass);
                        }
                    }
                    // More than one word part: get the lemma for each part and concatenate them
                    // with the lemma separator to form a compound lemma.
                    else {
                        lemmata = "";
                        String[] posTags = partOfSpeechTags.splitTag(partOfSpeech);

                        if (posTags.length == wordList.size()) {
                            for (int i = 0; i < wordList.size(); i++) {
                                String wordPiece = wordList.get(i);
                                if (i > 0) {
                                    lemmata = lemmata + lemmaSeparator;
                                }

                                LemmataEntry lemmaPiece = lemmatizeByLexiconAndRules(wordPiece, posTags[i]);

                                lemmata = lemmata + lemmaPiece.lemmasToString();
                            }
                        }
                    }
                }
            }

            lemmataAndLemmataTag.put(lemmataTag, lemmata);
        }

        return new LemmataEntry(lemmataAndLemmataTag, posEntry);
    }

    /**
     * Input arguments are parsed into a {@link BioLemmatizerCmdOpts} object. Valid input arguments
     * include:
     *
     * <pre>
     * VAL    : Single input to be lemmatized
     * VAL    : Part of speech of the single input to be lemmatized
     * -f VAL : optional path to a lexicon file. If not set, the default lexicon
     *          available on the classpath is used
     * -i VAL : the path to the input file
     * -l     : if present, only the lemma is returned (part-of-speech information is
     *          suppressed)
     * -o VAL : the path to the output file
     * -t     : if present, the interactive mode is used
     * </pre>
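     *
     * <p>
     * For example (file names illustrative):
     * </p>
     *
     * <pre>
     * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar -i input.txt -o output.txt
     * </pre>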
     *
     * @param args
     *            command-line arguments as described above
     */
    public static void main(String[] args) {

        BioLemmatizerCmdOpts options = new BioLemmatizerCmdOpts();
        CmdLineParser parser = new CmdLineParser(options);
        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            parser.printUsage(System.err);
            return;
        }

        File lexiconFile = options.getLexiconFile();
        BioLemmatizer bioLemmatizer = new BioLemmatizer(lexiconFile);
        boolean americanize = options.americanizedLemma();
        boolean outputLemmaOnly = options.outputLemmaOnly();
        boolean useInteractiveMode = options.useInteractiveMode();
        String inputStr = options.getInputStr();
        if (inputStr != null)
            inputStr = inputStr.trim();
        String inputStrPos = options.getInputStrPos();
        File inputFile = options.getInputFile();
        File outputFile = options.getOutputFile();
        System.out.println("=========================================================");
        System.out.println("=========================================================");
        System.out.println("=========================================================");
        System.out.println("Running BioLemmatizer....");
        try {
            if (useInteractiveMode) {
                runInteractiveMode(bioLemmatizer, outputLemmaOnly, americanize);
            } else if (inputStr != null) {
                LemmataEntry lemmata;
                if (americanize) {
                    lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(
                            new Americanize().americanize(unicodeHandler(inputStr)), inputStrPos);
                } else {
                    lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(inputStr), inputStrPos);
                }
                if (outputLemmaOnly) {
                    System.out.println("The lemma for '" + inputStr + "' is: " + lemmata.lemmasToString());
                } else {
                    System.out.println("The lemma for '" + inputStr + "' is: " + lemmata);
                }
            } else if (inputFile != null) {
                if (outputFile == null) {
                    System.err.println("Output file must be set if the input file parameter is used.");
                    parser.printUsage(System.err);
                    return;
                }
                processInputFile(inputFile, outputFile, bioLemmatizer, outputLemmaOnly, americanize);
            } else {
                System.err.println("Invalid input parameters...");
                parser.printUsage(System.err);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        System.out.println("=========================================================");
        System.out.println("=========================================================");
        System.out.println("=========================================================");
    }

    /**
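     * Lemmatize a tab-separated input file line by line, writing each input line back out with its
     * lemma appended. A sketch of the expected input format (one token per line, with an optional
     * tab-separated POS tag; tokens illustrative):
     *
     * <pre>
     * catalyses&lt;TAB&gt;NNS
     * running
     * </pre>
     *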
     * @param inputFile
     *            the input file, one token (with optional tab-separated POS tag) per line
     * @param outputFile
     *            the output file; each input line is written back with its lemma appended
     * @param bioLemmatizer
     *            the BioLemmatizer instance to use
     * @param outputLemmaOnly
     *            if true, only the lemma (without POS information) is appended
     * @param americanize
     *            if true, input is converted to American English spelling before lemmatization
     * @throws IOException
     */
    private static void processInputFile(File inputFile, File outputFile, BioLemmatizer bioLemmatizer,
            boolean outputLemmaOnly, boolean americanize) throws IOException {
        Americanize convert = null;
        if (americanize)
            convert = new Americanize();
        BufferedReader input;
        BufferedWriter output;

        try {
            // input = FileReaderUtil.initBufferedReader(inputFile, CharacterEncoding.UTF_8);
            input = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")
                    .newDecoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT)));
        } catch (FileNotFoundException e) {
            throw new RuntimeException("Unable to open the input file: " + inputFile.getAbsolutePath(), e);
        }

        try {
            // output = FileWriterUtil.initBufferedWriter(outputFile, CharacterEncoding.UTF_8,
            // WriteMode.OVERWRITE, FileSuffixEnforcement.OFF);
            output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile, false), Charset
                    .forName("UTF-8").newEncoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT)));
        } catch (FileNotFoundException e) {
            throw new RuntimeException("Unable to open the output file: " + outputFile.getAbsolutePath(), e);
        }

        String line = null;

        while ((line = input.readLine()) != null) {
            if (line.trim().length() == 0) {
                output.write("\n");
                continue;
            }
            line = line.trim();
            String[] pair = line.split("\t");
            String pos;
            if (pair.length == 1) {
                pos = "";
            } else {
                pos = pair[1];
            }
            LemmataEntry lemmata;
            if (americanize)
                lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(convert.americanize(unicodeHandler(pair[0])), pos);
            else
                lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(pair[0]), pos);
            String result;
            if (outputLemmaOnly) {
                result = line + "\t" + lemmata.lemmasToString() + "\n";
            } else {
                result = line + "\t" + lemmata + "\n";
            }
            output.write(result);
        }
        // close input
        input.close();
        // close output
        output.close();
    }

    private static void runInteractiveMode(BioLemmatizer bioLemmatizer, boolean outputLemmaOnly, boolean americanize)
            throws IOException {
        Americanize convert = null;
        if (americanize)
            convert = new Americanize();
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        String input;
        System.out.println("Running BioLemmatizer in interactive mode. Please type a word to be lemmatized"
                + " with an optional part-of-speech, e.g. \"run\" or \"run NN\"");
        while ((input = in.readLine()) != null && input.length() != 0) {
            String[] arguments = input.split("\\s");
            if (arguments.length > 2) {
                System.out.println("Only one word to be lemmatized (with or without POS) is allowed");
                System.exit(0);
            }
            String spelling = arguments[0].trim();
            String partOfSpeech = (arguments.length == 2) ? arguments[1].trim() : null;
            LemmataEntry lemmata;
            if (americanize)
                lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(convert.americanize(unicodeHandler(spelling)),
                        partOfSpeech);
            else
                lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(spelling), partOfSpeech);
            if (outputLemmaOnly) {
                System.out.println(lemmata.lemmasToString());
            } else {
                System.out.println(lemmata);
            }
        }
    }
}

/** Rule-based lemmatizer for English, adapted from MorphAdorner's EnglishLemmatizer. */
class MorphAdornerLemmatizer extends EnglishLemmatizer {
    /** List of detachment rules. */
    protected static String rulesFileName = "englishrules.txt";

    /**
     * Create an English lemmatizer.
     *
     * @throws Exception
     *             because the {@link EnglishLemmatizer} constructor throws Exception
     */
    public MorphAdornerLemmatizer() throws Exception {
        // discard the rules of the original MorphAdorner lemmatizer
        rules.clear();
        // load the new rules
        try {
            loadRules(BioLemmatizer.class.getResource(rulesFileName), "utf-8");
        } catch (IOException e) {
            throw new RuntimeException("Unable to load English rules file.", e);
        }
        // discard the irregular-forms table of the original MorphAdorner lemmatizer;
        // the irregular English forms are integrated into the current lexicon
        irregularForms.clear();
    }
}

/**
 * BioWordLexicon: biomedical word lexicon which extends MorphAdorner's English word lexicon.
 */
class BioWordLexicon extends DefaultLexicon {
    /** Resource path to the word lexicon. */
    protected static final String lexiconPath = "lexicon.lex";

    /**
     * Create a lexicon, loading either the default biomedical lexicon from the classpath or the
     * supplied lexicon file.
     *
     * @param lexiconFile
     *            the lexicon file to load; if null, the default lexicon is used
     * @throws IOException
     */
    public BioWordLexicon(File lexiconFile) throws IOException {
        // Create an empty lexicon.
        super();
        if (lexiconFile == null) {
            // Load the default word lexicon.
            loadLexicon(BioLemmatizer.class.getResource(lexiconPath), "utf-8");
        } else {
            loadLexicon(lexiconFile.toURI().toURL(), "utf-8");
        }
    }
}

/** POSEntry: stores different POS tags and the corresponding tagset labels */
class POSEntry {
    public Map<String, String> tagToTagSet;

    /**
     * Constructor to initialize the class field by loading the different POS tagsets
     */
    public POSEntry() {
        tagToTagSet = new HashMap<String, String>();
        // NUPOS tags
        Lexicon wordLexicon;
        try {
            wordLexicon = new DefaultWordLexicon();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        addNewTagSet(Arrays.asList(wordLexicon.getCategories()), "NUPOS");

        // PennPOS tags
        String mappingFileName = "PennPOStoNUPOS.mapping";
        InputStream is = BioLemmatizer.class.getResourceAsStream(mappingFileName);
        Map<String, String[]> mappingPennPOStoNUPOS;
        try {
            mappingPennPOStoNUPOS = BioLemmatizer.loadPOSMappingFile(is);
        } catch (IOException e) {
            throw new RuntimeException("Error while opening mapping file: " + mappingFileName, e);
        }
        addNewTagSet(mappingPennPOStoNUPOS.keySet(), "PennPOS");
    }

    /**
     * Add a new POS tagset
     *
     * @param tags
     *            a set of POS tags
     * @param tagSetLabel
     *            the corresponding tagset label
     */
    public void addNewTagSet(Collection<String> tags, String tagSetLabel) {
        for (String tag : tags) {
            tagToTagSet.put(tag, tagSetLabel);
        }
    }

    /**
     * Retrieve the tagset label of the input POS tag
     *
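     * <p>
     * For example, {@code getTagSetLabel("NNS")} returns {@code "PennPOS"}, assuming NNS appears as
     * a key in the Penn Treebank mapping file; an unrecognized tag yields {@code "NONE"}.
     * </p>
     *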
     * @param category
     *            an input POS tag
     * @return the corresponding POS tagset label
     */
    public String getTagSetLabel(String category) {
        String defaultLabel = "NONE";
        return tagToTagSet.containsKey(category) ? tagToTagSet.get(category) : defaultLabel;
    }
}