/*
Copyright (c) 2012, Regents of the University of Colorado
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the University of Colorado nor the names of its
  contributors may be used to endorse or promote products derived from this
  software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package edu.ucdenver.ccp.nlp.biolemmatizer;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.EnglishLemmatizer;
import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.Lemmatizer;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.DefaultLexicon;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.DefaultWordLexicon;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.Lexicon;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.LexiconEntry;
import edu.northwestern.at.utils.corpuslinguistics.partsofspeech.PartOfSpeechTags;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.PennTreebankTokenizer;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer;

// Stanford CoreNLP class used below for American English spelling conversion
import edu.stanford.nlp.process.Americanize;

/**
 * BioLemmatizer: Lemmatize a word in biomedical text and return its lemma; the part of speech
 * (POS) of the word is optional.
 *
 * <p>
 * Usage:
 * </p>
 *
 * <p>
 * <code>
 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] {@literal <input_string>} [POS tag] or<br>
 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] -i {@literal <input_file_name> -o <output_file_name>} or<br>
 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] -t<br>
 * </code>
 * </p>
 *
 * <p>
 * Example:
 * </p>
 *
 * <p>
 * <code>
 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar catalyses NNS
 * </code>
 * </p>
 *
 * <p>
 * Please see the README file for more usage examples.
 * </p>
 *
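 * <p>
 * Programmatic use, a minimal sketch (the expected lemma follows the command-line example above):
 * </p>
 *
 * <pre>
 * BioLemmatizer bioLemmatizer = new BioLemmatizer();
 * LemmataEntry lemmata = bioLemmatizer.lemmatizeByLexiconAndRules("catalyses", "NNS");
 * System.out.println(lemmata.lemmasToString()); // expected: "catalysis"
 * </pre>
 *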
 * @author Haibin Liu <Haibin.Liu@ucdenver.edu>, William A Baumgartner Jr
 *         <William.Baumgartner@ucdenver.edu> and Karin Verspoor <Karin.Verspoor@ucdenver.edu>
 */
public class BioLemmatizer {
    /** Lemma separator character */
    public static String lemmaSeparator = "||";

    /** Rule-based lemmatizer */
    public Lemmatizer lemmatizer;

    /** Word lexicon for lemma lookup */
    public Lexicon wordLexicon;

    /** NUPOS tags */
    public PartOfSpeechTags partOfSpeechTags;

    /** Extracts individual word parts from a contracted word. */
    public WordTokenizer spellingTokenizer;

    /** Hierarchical mapping from Penn Treebank POS tags to NUPOS tags */
    public Map<String, String[]> mappingPennPOStoNUPOS;

    /** Hierarchical mapping from major word classes to Penn Treebank POS tags */
    public Map<String, String[]> mappingMajorClasstoPennPOS;

    /** Name of the Part-Of-Speech mapping file */
    protected static String mappingFileName;

    /** POSEntry object used to retrieve POS tag information */
    public POSEntry posEntry;
    /**
     * Default constructor; loads the lexicon from the classpath
     */
    public BioLemmatizer() {
        this(null);
    }

    /**
     * Constructor to initialize the class fields
     *
     * @param lexiconFile
     *            a reference to the lexicon file to use. If null, the lexicon that comes with the
     *            BioLemmatizer distribution is loaded from the classpath
     */
    public BioLemmatizer(File lexiconFile) {
        // Get the default rule-based lemmatizer.
        try {
            lemmatizer = new MorphAdornerLemmatizer();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        // Get the default word lexicon.
        try {
            wordLexicon = new BioWordLexicon(lexiconFile);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // Get the part-of-speech tags from the word lexicon.
        partOfSpeechTags = wordLexicon.getPartOfSpeechTags();
        // Get the spelling tokenizer.
        spellingTokenizer = new PennTreebankTokenizer();
        // Set the lexicon which may provide lemmata.
        lemmatizer.setLexicon(wordLexicon);
        // Set the dictionary for checking lemmata after applying lemmatization rules.
        lemmatizer.setDictionary(setDictionary(wordLexicon));

        // Load the Part-Of-Speech mapping files
        mappingFileName = "PennPOStoNUPOS.mapping";
        InputStream is = BioLemmatizer.class.getResourceAsStream(mappingFileName);
        try {
            mappingPennPOStoNUPOS = loadPOSMappingFile(is);
        } catch (IOException e) {
            throw new RuntimeException("Unable to load mapping: " + mappingFileName, e);
        }

        mappingFileName = "MajorClasstoPennPOS.mapping";
        is = BioLemmatizer.class.getResourceAsStream(mappingFileName);
        try {
            mappingMajorClasstoPennPOS = loadPOSMappingFile(is);
        } catch (IOException e) {
            throw new RuntimeException("Unable to load mapping: " + mappingFileName, e);
        }

        // Load the POS tagsets
        posEntry = new POSEntry();
    }

    /**
     * Static method to load a Part-Of-Speech mapping file
     *
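     * <p>
     * Each line of the mapping file is expected to hold a tab-separated pair whose second field is
     * a comma-separated list of mapped tags (a format sketch, not literal file content):
     * </p>
     *
     * <pre>
     * &lt;tag&gt;&lt;TAB&gt;&lt;mappedTag1&gt;,&lt;mappedTag2&gt;,...
     * </pre>
     *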
     * @param is
     *            InputStream of the mapping file
     * @return a Map object that stores the hierarchical mapping information in the file
     * @throws IOException
     */
    static Map<String, String[]> loadPOSMappingFile(InputStream is) throws IOException {
        Map<String, String[]> mapping = new HashMap<String, String[]>();

        try {
            InputStreamReader isr = new InputStreamReader(is);
            BufferedReader input = new BufferedReader(isr);

            String line = null;
            while ((line = input.readLine()) != null) {
                line = line.trim();
                String[] pair = line.split("\t");
                String[] mappingSet = pair[1].split(",");
                mapping.put(pair[0], mappingSet);
                // also store the key with any leading byte-order mark (U+FEFF) stripped
                mapping.put(pair[0].replaceAll("^\\uFEFF", ""), mappingSet);
            }

            input.close();
            isr.close();
        } finally {
            is.close();
        }
        return mapping;
    }

    /**
     * Create a dictionary from a word lexicon for validating lemmata resulting from lemmatization
     * rules
     *
     * @param wordLexicon
     *            a word lexicon
     * @return a set that contains a dictionary generated from the word lexicon
     */
    private Set<String> setDictionary(Lexicon wordLexicon) {
        Set<String> dictionarySet = new HashSet<String>();

        // generate the dictionary from the lexicon
        String[] lexiconEntries = wordLexicon.getEntries();
        for (String entry : lexiconEntries) {
            String[] lemmata = wordLexicon.getLemmata(entry);
            for (String lemma : lemmata) {
                dictionarySet.add(lemma.toLowerCase());
            }
        }

        return dictionarySet;
    }

    /**
     * Retrieve an array of NUPOS tags corresponding to a Penn Treebank POS tag
     *
     * @param partOfSpeech
     *            a POS tag
     * @return an array of corresponding NUPOS tags
     */
    private String[] getNUPOSTagFromPennPOS(String partOfSpeech) {
        String[] nuPOSTag = mappingPennPOStoNUPOS.get(partOfSpeech.toUpperCase());
        return nuPOSTag != null ? nuPOSTag : new String[] { partOfSpeech };
    }

    /**
     * Retrieve an array of Penn Treebank POS tags corresponding to a NUPOS tag
     *
     * @param partOfSpeech
     *            a POS tag
     * @return an array of corresponding Penn Treebank POS tags
     */
    private String[] getPennPOSFromNUPOS(String partOfSpeech) {
        List<String> result = new ArrayList<String>();
        for (String key : mappingPennPOStoNUPOS.keySet()) {
            for (String value : mappingPennPOStoNUPOS.get(key)) {
                if (value.equals(partOfSpeech)) {
                    result.add(key);
                    break;
                }
            }
        }
        return result.size() != 0 ? result.toArray(new String[result.size()]) : new String[] { partOfSpeech };
    }

    /**
     * Retrieve sibling Penn Treebank POS tags of a Penn Treebank POS tag from the POS hierarchy
     *
     * @param partOfSpeech
     *            a Penn Treebank POS tag
     * @return sibling Penn Treebank POS tags of the Penn Treebank POS tag
     */
    private String[] getSiblingPennPOSTag(String partOfSpeech) {
        // check if partOfSpeech exists in the hierarchy
        boolean globalFlag = false;
        for (String key : mappingMajorClasstoPennPOS.keySet()) {
            String[] posTag = mappingMajorClasstoPennPOS.get(key);
            for (String pos : posTag) {
                if (pos.equals(partOfSpeech)) {
                    globalFlag = true;
                    break;
                }
            }
            if (globalFlag)
                break;
        }

        if (globalFlag) {
            String foundKey = "";
            for (String key : mappingMajorClasstoPennPOS.keySet()) {
                String[] posTag = mappingMajorClasstoPennPOS.get(key);
                boolean localFlag = false;
                for (String pos : posTag) {
                    if (pos.equals(partOfSpeech)) {
                        foundKey = key;
                        localFlag = true;
                        break;
                    }
                }
                if (localFlag)
                    break;
            }
            List<String> merge = new ArrayList<String>();
            for (String pos : mappingMajorClasstoPennPOS.get(foundKey)) {
                if (!pos.equals(partOfSpeech))
                    merge.add(pos);
            }
            return merge.toArray(new String[merge.size()]);
        } else {
            return new String[] { partOfSpeech };
        }
    }

    /**
     * Retrieve sibling NUPOS tags of a Penn Treebank POS tag from the POS hierarchy
     *
     * @param partOfSpeech
     *            a Penn Treebank POS tag
     * @return sibling NUPOS tags of the Penn Treebank POS tag
     */
    private String[] getSiblingNUPOSTag(String partOfSpeech) {
        // check if partOfSpeech exists in the hierarchy
        boolean globalFlag = false;
        for (String key : mappingMajorClasstoPennPOS.keySet()) {
            String[] posTag = mappingMajorClasstoPennPOS.get(key);
            for (String pos : posTag) {
                if (pos.equals(partOfSpeech)) {
                    globalFlag = true;
                    break;
                }
            }
            if (globalFlag)
                break;
        }

        if (globalFlag) {
            String foundKey = "";
            for (String key : mappingMajorClasstoPennPOS.keySet()) {
                String[] posTag = mappingMajorClasstoPennPOS.get(key);
                boolean localFlag = false;
                for (String pos : posTag) {
                    if (pos.equals(partOfSpeech)) {
                        foundKey = key;
                        localFlag = true;
                        break;
                    }
                }
                if (localFlag)
                    break;
            }
            List<String> merge = new ArrayList<String>();
            for (String pos : mappingMajorClasstoPennPOS.get(foundKey)) {
                if (!pos.equals(partOfSpeech)) {
                    merge.addAll(Arrays.asList(mappingPennPOStoNUPOS.get(pos.toUpperCase())));
                }
            }

            return merge.toArray(new String[merge.size()]);
        } else {
            return new String[] { partOfSpeech };
        }
    }

    /**
     * Retrieve lemmas and the corresponding categories of the input string
     *
     * @param spelling
     *            an input string
     * @return a Map object that stores lemmas and categories of the string; key: category, value:
     *         lemma
     */
    private Map<String, String> getLemmasAndCategories(String spelling) {
        Map<String, String> lemmasAndCategories = new HashMap<String, String>();
        LexiconEntry lexiconEntry = wordLexicon.getLexiconEntry(spelling);
        if (lexiconEntry != null)
            lemmasAndCategories = lexiconEntry.lemmata;
        return lemmasAndCategories;
    }

    /**
     * Clean up a raw lemma resulting from the lemmatization rules by removing a trailing
     * apostrophe, if present
     *
     * @param lemma
     *            a raw lemma
     * @return the cleaned lemma
     */
    private static String cleanUpLemma(String lemma) {
        String newLemma = lemma;
        String lastChar = lemma.substring(lemma.length() - 1);
        if (lastChar.equals("'")) {
            newLemma = lemma.substring(0, lemma.length() - 1);
        }
        return newLemma;
    }

    /**
     * Convert special Unicode characters into modern English spelling
     *
     * @param input
     *            an input string
     * @return the input string in modern English spelling
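     *         For example, "encyclopædia" becomes "encyclopaedia" via the "æ" (u00E6) -> "ae"
     *         mapping below (an illustrative case).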
     */
    static String unicodeHandler(String input) {
        // define the mapping between special Unicode characters and modern English spelling
        Map<String, String> specialUnicodeCharToModernEnglishMapping = new HashMap<String, String>();

        specialUnicodeCharToModernEnglishMapping.put("u00E6", "ae");
        specialUnicodeCharToModernEnglishMapping.put("u0153", "oe");
        specialUnicodeCharToModernEnglishMapping.put("u00E4", "a");
        specialUnicodeCharToModernEnglishMapping.put("u00E0", "a");
        specialUnicodeCharToModernEnglishMapping.put("u00E1", "a");
        specialUnicodeCharToModernEnglishMapping.put("u0113", "e");
        specialUnicodeCharToModernEnglishMapping.put("u00E9", "e");
        specialUnicodeCharToModernEnglishMapping.put("u00E8", "e");
        specialUnicodeCharToModernEnglishMapping.put("u00EB", "e");
        specialUnicodeCharToModernEnglishMapping.put("u00EF", "i");
        specialUnicodeCharToModernEnglishMapping.put("u00F1", "n");
        specialUnicodeCharToModernEnglishMapping.put("u014D", "o");
        specialUnicodeCharToModernEnglishMapping.put("u00F6", "o");
        specialUnicodeCharToModernEnglishMapping.put("u00F4", "o");
        specialUnicodeCharToModernEnglishMapping.put("u016B", "u");
        specialUnicodeCharToModernEnglishMapping.put("u00FA", "u");

        String output = input;
        for (String unicode : specialUnicodeCharToModernEnglishMapping.keySet()) {
            // prefixing the key with a backslash turns e.g. "u00E6" into a regex Unicode
            // escape, so replaceAll matches the actual special character
            String regex = "\\" + unicode;
            output = output.replaceAll(regex, specialUnicodeCharToModernEnglishMapping.get(unicode));
        }

        return output;
    }

    /**
     * Lemmatize a string with POS tag using the lexicon only
     *
     * @param spelling
     *            an input string
     * @param partOfSpeech
     *            POS tag of the input string
     * @return a LemmataEntry object containing lemma and POS information
     */
    public LemmataEntry lemmatizeByLexicon(String spelling, String partOfSpeech) {
        Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>();
        String lemmata = spelling;
        String lemmataTag;
        if (partOfSpeech == null)
            partOfSpeech = "";
        // default POS tag = NONE
        if (partOfSpeech.trim().length() == 0)
            lemmataTag = "NONE";
        else
            lemmataTag = partOfSpeech;

        // check the POS tagset
        String tagSetLabel = posEntry.getTagSetLabel(partOfSpeech);

        String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech);

        // Several lexicon search strategies are tried in turn to make the most of the lexicon
        String lemma = "*";
        String category = "*";

        if (tagSetLabel.equals("PennPOS")) {
            // direct Penn POS tag search
            lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech);
            if (lemma.equals("*")) {
                lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech);
            }
            if (!lemma.equals("*")) {
                lemmata = lemma;
                category = partOfSpeech;
                // System.out.println("found in the Penn direct lexicon: " + lemma);
            }
            // Penn POS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalPennPOSTag = getSiblingPennPOSTag(partOfSpeech);
                for (String pos : hierarchicalPennPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        lemmata = lemma;
                        category = pos;
                        // System.out.println("found in the Penn hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
            // Convert the Penn POS tag into NUPOS tags and search
            if (lemma.equals("*")) {
                for (String pos : nuPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        lemmata = lemma;
                        category = pos;
                        // System.out.println("found in the converted NU direct lexicon: " + lemma);
                        break;
                    }
                }
            }
            // NUPOS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalNUPOSTag = getSiblingNUPOSTag(partOfSpeech);
                for (String pos : hierarchicalNUPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        lemmata = lemma;
                        category = pos;
                        // System.out.println("found in the converted NU hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
        } else if (tagSetLabel.equals("NUPOS")) {
            // direct NUPOS tag search
            lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech);
            if (lemma.equals("*")) {
                lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech);
            }
            if (!lemma.equals("*")) {
                lemmata = lemma;
                category = partOfSpeech;
                // System.out.println("found in the NU direct lexicon: " + lemma);
            }
            // NUPOS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalNUPOSTag = getSiblingNUPOSTag(getPennPOSFromNUPOS(partOfSpeech)[0]);
                for (String pos : hierarchicalNUPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        lemmata = lemma;
                        category = pos;
                        // System.out.println("found in the NU hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
        }

        // backup lexicon lookup: search without POS tags and return all lemmas
        Map<String, String> lemmasAndCategories = new HashMap<String, String>();
        if (tagSetLabel.equals("NONE") || lemma.equals("*")) {
            // if (tagSetLabel.equals("NONE")) {
            lemmasAndCategories = getLemmasAndCategories(spelling.toLowerCase());
            if (lemmasAndCategories.isEmpty()) {
                lemmasAndCategories = getLemmasAndCategories(spelling.toUpperCase());
            }
        }

        // a lemma was found in the lexicon
        if (!lemmasAndCategories.isEmpty()) {
            lemmataAndLemmataTag = lemmasAndCategories;
            // System.out.println("found in the lexicon");
        } else if (!lemma.equals("*")) {
            lemmata = lemma;
            lemmataTag = category;
            lemmataAndLemmataTag.put(lemmataTag, lemmata);
        }
        // the lexicon has been checked but nothing was found; return the original input
        else
            lemmataAndLemmataTag.put(lemmataTag, lemmata);

        return new LemmataEntry(lemmataAndLemmataTag, posEntry);
    }

    /**
     * Lemmatize a string with POS tag using lemmatization rules only
     *
     * @param spelling
     *            an input string
     * @param partOfSpeech
     *            POS tag of the input string
     * @return a LemmataEntry object containing lemma and POS information
     */
    public LemmataEntry lemmatizeByRules(String spelling, String partOfSpeech) {
        // option to reset the dictionary the rule-based lemmatizer uses to validate results:
        // lemmatizer.setDictionary(new HashSet<String>());

        Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>();
        String lemmata = spelling;
        String lemmataTag;
        // default POS tag = NONE
        if (partOfSpeech == null)
            partOfSpeech = "";
        if (partOfSpeech.trim().length() == 0)
            lemmataTag = "NONE";
        else
            lemmataTag = partOfSpeech;

        String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech);

        // Use the rule-based lemmatizer.

        // Get the lemmatization word class for the part of speech.
        String lemmaClass = "";
        for (String pos : nuPOSTag) {
            lemmaClass = partOfSpeechTags.getLemmaWordClass(pos);
            if (lemmaClass.length() != 0) {
                break;
            }
        }

        // Do not lemmatize words which should not be lemmatized, including proper names.
        if (!lemmatizer.cantLemmatize(spelling) && !lemmaClass.equals("none")) {
            // Try the compound word exceptions list first.
            lemmata = lemmatizer.lemmatize(spelling, "compound");

            // If the lemma was not found, keep trying.
            if (lemmata.equalsIgnoreCase(spelling)) {
                // Extract the individual word parts; there may be more than one for a contraction.
                List<String> wordList = spellingTokenizer.extractWords(spelling);

                // If there is just one word part, get its lemma.
                if (!partOfSpeechTags.isCompoundTag(partOfSpeech) || (wordList.size() == 1)) {
                    if (lemmaClass.length() == 0) {
                        lemmata = lemmatizer.lemmatize(spelling);
                    } else {
                        lemmata = lemmatizer.lemmatize(spelling, lemmaClass);
                    }
                }
                // More than one word part: get the lemma for each part and concatenate them with
                // the lemma separator to form a compound lemma.
                else {
                    lemmata = "";
                    String[] posTags = partOfSpeechTags.splitTag(partOfSpeech);

                    if (posTags.length == wordList.size()) {
                        for (int i = 0; i < wordList.size(); i++) {
                            String wordPiece = wordList.get(i);
                            if (i > 0) {
                                lemmata = lemmata + lemmaSeparator;
                            }

                            LemmataEntry lemmaPiece = lemmatizeByRules(wordPiece, posTags[i]);

                            lemmata = lemmata + lemmaPiece.lemmasToString();
                        }
                    }
                }
            }
        }

        lemmataAndLemmataTag.put(lemmataTag, lemmata);

        return new LemmataEntry(lemmataAndLemmataTag, posEntry);
    }

    /**
     * Lemmatize a string with POS tag using both lexicon lookup and lemmatization rules. This is
     * the preferred method, as it gives the best lemmatization performance.
     *
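     * <p>
     * A minimal usage sketch (the lemma shown is illustrative):
     * </p>
     *
     * <pre>
     * LemmataEntry entry = bioLemmatizer.lemmatizeByLexiconAndRules("studies", "NNS");
     * String lemma = entry.lemmasToString(); // expected: "study"
     * </pre>
     *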
     * @param spelling
     *            an input string
     * @param partOfSpeech
     *            POS tag of the input string
     * @return a LemmataEntry object containing lemma and POS information
     */
    public LemmataEntry lemmatizeByLexiconAndRules(String spelling, String partOfSpeech) {

        Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>();
        String lemmata = spelling;
        String lemmataTag;
        // default POS tag = NONE
        if (partOfSpeech == null)
            partOfSpeech = "";
        if (partOfSpeech.trim().length() == 0)
            lemmataTag = "NONE";
        else
            lemmataTag = partOfSpeech;

        // check the POS tagset
        String tagSetLabel = posEntry.getTagSetLabel(partOfSpeech);

        String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech);

        // Try the lexicon first; several search strategies are tried in turn to make the most
        // of the lexicon
        String lemma = "*";
        String category = "*";

        if (tagSetLabel.equals("PennPOS")) {
            // direct Penn POS tag search
            lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech);
            if (lemma.equals("*")) {
                lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech);
            }
            if (!lemma.equals("*")) {
                category = partOfSpeech;
                // System.out.println("found in the Penn direct lexicon: " + lemma);
            }
            // Penn POS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalPennPOSTag = getSiblingPennPOSTag(partOfSpeech);
                for (String pos : hierarchicalPennPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        category = pos;
                        // System.out.println("found in the Penn hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
            // Convert the Penn POS tag into NUPOS tags and search
            if (lemma.equals("*")) {
                for (String pos : nuPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        category = pos;
                        // System.out.println("found in the converted NU direct lexicon: " + lemma);
                        break;
                    }
                }
            }
            // NUPOS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalNUPOSTag = getSiblingNUPOSTag(partOfSpeech);
                for (String pos : hierarchicalNUPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        category = pos;
                        // System.out.println("found in the converted NU hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
        } else if (tagSetLabel.equals("NUPOS")) {
            // direct NUPOS tag search
            lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech);
            if (lemma.equals("*")) {
                lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech);
            }
            if (!lemma.equals("*")) {
                category = partOfSpeech;
                // System.out.println("found in the NU direct lexicon: " + lemma);
            }
            // NUPOS tag hierarchical search
            if (lemma.equals("*")) {
                String[] hierarchicalNUPOSTag = getSiblingNUPOSTag(getPennPOSFromNUPOS(partOfSpeech)[0]);
                for (String pos : hierarchicalNUPOSTag) {
                    lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos);
                    if (lemma.equals("*")) {
                        lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos);
                    }
                    if (!lemma.equals("*")) {
                        category = pos;
                        // System.out.println("found in the NU hierarchical lexicon: " + lemma);
                        break;
                    }
                }
            }
        }

        // if the tagSetLabel is NONE, invoke the backup lexicon lookup:
        // search without POS tags and return all lemmas
        Map<String, String> lemmasAndCategories = new HashMap<String, String>();
        // if (tagSetLabel.equals("NONE") || lemma.equals("*")) {
        if (tagSetLabel.equals("NONE")) {
            lemmasAndCategories = getLemmasAndCategories(spelling.toLowerCase());
            if (lemmasAndCategories.isEmpty()) {
                lemmasAndCategories = getLemmasAndCategories(spelling.toUpperCase());
            }
        }

        // a lemma was found in the lexicon
        if (!lemmasAndCategories.isEmpty()) {
            lemmataAndLemmataTag = lemmasAndCategories;
            // System.out.println("found in the lexicon");
        } else if (!lemma.equals("*")) {
            lemmata = lemma;
            lemmataTag = category;
            lemmataAndLemmataTag.put(lemmataTag, lemmata);
        }
        // for testing purposes, to test the lexicon only:
        // else lemmataAndLemmataTag.put(lemmataTag, lemmata);

        // Lemma not found in the word lexicon. Use the rule-based lemmatizer.
        else {

            // Get the lemmatization word class for the part of speech.
            String lemmaClass = "";
            for (String pos : nuPOSTag) {
                lemmaClass = partOfSpeechTags.getLemmaWordClass(pos);
                if (lemmaClass.length() != 0) {
                    break;
                }
            }

            // Do not lemmatize words which should not be lemmatized, including proper names.
            if (!lemmatizer.cantLemmatize(spelling) && !lemmaClass.equals("none")) {
                // Try the compound word exceptions list first.
                lemmata = lemmatizer.lemmatize(spelling, "compound");

                // If the lemma was not found, keep trying.
                if (lemmata.equalsIgnoreCase(spelling)) {
                    // Extract the individual word parts; there may be more than one for a
                    // contraction.
                    List<String> wordList = spellingTokenizer.extractWords(spelling);

                    // If there is just one word part, get its lemma.
                    if (!partOfSpeechTags.isCompoundTag(partOfSpeech) || (wordList.size() == 1)) {
                        if (lemmaClass.length() == 0) {
                            lemmata = lemmatizer.lemmatize(spelling);
                        } else {
                            lemmata = lemmatizer.lemmatize(spelling, lemmaClass);
                        }
                    }
                    // More than one word part: get the lemma for each part and concatenate them
                    // with the lemma separator to form a compound lemma.
                    else {
                        lemmata = "";
                        String[] posTags = partOfSpeechTags.splitTag(partOfSpeech);

                        if (posTags.length == wordList.size()) {
                            for (int i = 0; i < wordList.size(); i++) {
                                String wordPiece = wordList.get(i);
                                if (i > 0) {
                                    lemmata = lemmata + lemmaSeparator;
                                }

                                LemmataEntry lemmaPiece = lemmatizeByLexiconAndRules(wordPiece, posTags[i]);

                                lemmata = lemmata + lemmaPiece.lemmasToString();
                            }
                        }
                    }
                }
            }

            lemmataAndLemmataTag.put(lemmataTag, lemmata);
        }

        return new LemmataEntry(lemmataAndLemmataTag, posEntry);
    }

    /**
     * Input arguments are parsed into a {@link BioLemmatizerCmdOpts} object. Valid input arguments
     * include:
     *
     * <pre>
     * VAL    : Single input to be lemmatized
     * VAL    : Part of speech of the single input to be lemmatized
     * -f VAL : optional path to a lexicon file. If not set, the default lexicon
     *          available on the classpath is used
     * -i VAL : the path to the input file
     * -l     : if present, only the lemma is returned (part-of-speech information is
     *          suppressed)
     * -o VAL : the path to the output file
     * -t     : if present, the interactive mode is used
     * </pre>
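     *
     * <p>
     * For example (file names illustrative):
     * </p>
     *
     * <pre>
     * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar -i input.txt -o output.txt
     * </pre>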
     *
     * @param args
     *            command-line arguments as described above
     */
    public static void main(String[] args) {

        BioLemmatizerCmdOpts options = new BioLemmatizerCmdOpts();
        CmdLineParser parser = new CmdLineParser(options);
        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            parser.printUsage(System.err);
            return;
        }

        File lexiconFile = options.getLexiconFile();
        BioLemmatizer bioLemmatizer = new BioLemmatizer(lexiconFile);
        boolean americanize = options.americanizedLemma();
        boolean outputLemmaOnly = options.outputLemmaOnly();
        boolean useInteractiveMode = options.useInteractiveMode();
        String inputStr = options.getInputStr();
        if (inputStr != null)
            inputStr = inputStr.trim();
        String inputStrPos = options.getInputStrPos();
        File inputFile = options.getInputFile();
        File outputFile = options.getOutputFile();
        System.out.println("=========================================================");
        System.out.println("=========================================================");
        System.out.println("=========================================================");
        System.out.println("Running BioLemmatizer....");
        try {
            if (useInteractiveMode) {
                runInteractiveMode(bioLemmatizer, outputLemmaOnly, americanize);
            } else if (inputStr != null) {
                LemmataEntry lemmata;
                if (americanize) {
                    lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(
                            new Americanize().americanize(unicodeHandler(inputStr)), inputStrPos);
                } else {
                    lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(inputStr), inputStrPos);
                }
                if (outputLemmaOnly) {
                    System.out.println("The lemma for '" + inputStr + "' is: " + lemmata.lemmasToString());
                } else {
                    System.out.println("The lemma for '" + inputStr + "' is: " + lemmata);
                }
            } else if (inputFile != null) {
                if (outputFile == null) {
                    System.err.println("Output file must be set if the input file parameter is used.");
                    parser.printUsage(System.err);
                    return;
                }
                processInputFile(inputFile, outputFile, bioLemmatizer, outputLemmaOnly, americanize);
            } else {
                System.err.println("Invalid input parameters...");
                parser.printUsage(System.err);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        System.out.println("=========================================================");
        System.out.println("=========================================================");
        System.out.println("=========================================================");
    }

    /**
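     * Lemmatize a tab-separated input file line by line, writing each input line back out with its
     * lemma appended. A sketch of the expected input format (one token per line, with an optional
     * tab-separated POS tag; tokens illustrative):
     *
     * <pre>
     * catalyses&lt;TAB&gt;NNS
     * running
     * </pre>
     *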
     * @param inputFile
     *            the input file, one token (with optional tab-separated POS tag) per line
     * @param outputFile
     *            the output file; each input line is written back with its lemma appended
     * @param bioLemmatizer
     *            the BioLemmatizer instance to use
     * @param outputLemmaOnly
     *            if true, only the lemma (without POS information) is appended
     * @param americanize
     *            if true, input is converted to American English spelling before lemmatization
     * @throws IOException
     */
    private static void processInputFile(File inputFile, File outputFile, BioLemmatizer bioLemmatizer,
            boolean outputLemmaOnly, boolean americanize) throws IOException {
        Americanize convert = null;
        if (americanize)
            convert = new Americanize();
        BufferedReader input;
        BufferedWriter output;

        try {
            // input = FileReaderUtil.initBufferedReader(inputFile, CharacterEncoding.UTF_8);
            input = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")
                    .newDecoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT)));
        } catch (FileNotFoundException e) {
            throw new RuntimeException("Unable to open the input file: " + inputFile.getAbsolutePath(), e);
        }

        try {
            // output = FileWriterUtil.initBufferedWriter(outputFile, CharacterEncoding.UTF_8,
            // WriteMode.OVERWRITE, FileSuffixEnforcement.OFF);
            output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile, false), Charset
                    .forName("UTF-8").newEncoder().onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT)));
        } catch (FileNotFoundException e) {
            throw new RuntimeException("Unable to open the output file: " + outputFile.getAbsolutePath(), e);
        }

        String line = null;

        while ((line = input.readLine()) != null) {
            if (line.trim().length() == 0) {
                output.write("\n");
                continue;
            }
            line = line.trim();
            String[] pair = line.split("\t");
            String pos;
            if (pair.length == 1) {
                pos = "";
            } else {
                pos = pair[1];
            }
            LemmataEntry lemmata;
            if (americanize)
                lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(convert.americanize(unicodeHandler(pair[0])), pos);
            else
                lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(pair[0]), pos);
            String result;
            if (outputLemmaOnly) {
                result = line + "\t" + lemmata.lemmasToString() + "\n";
            } else {
                result = line + "\t" + lemmata + "\n";
            }
            output.write(result);
        }
        // close input
        input.close();
        // close output
        output.close();
    }

    private static void runInteractiveMode(BioLemmatizer bioLemmatizer, boolean outputLemmaOnly, boolean americanize)
            throws IOException {
        Americanize convert = null;
        if (americanize)
            convert = new Americanize();
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        String input;
        System.out.println("Running BioLemmatizer in interactive mode. Please type a word to be lemmatized"
                + " with an optional part-of-speech, e.g. \"run\" or \"run NN\"");
        while ((input = in.readLine()) != null && input.length() != 0) {
            String[] arguments = input.split("\\s");
            if (arguments.length > 2) {
                System.out.println("Only one word to be lemmatized (with or without POS) is allowed");
                System.exit(0);
            }
            String spelling = arguments[0].trim();
            String partOfSpeech = (arguments.length == 2) ? arguments[1].trim() : null;
            LemmataEntry lemmata;
            if (americanize)
                lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(convert.americanize(unicodeHandler(spelling)),
                        partOfSpeech);
            else
                lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(spelling), partOfSpeech);
            if (outputLemmaOnly) {
                System.out.println(lemmata.lemmasToString());
            } else {
                System.out.println(lemmata);
            }
        }
    }
}

/** Rule-based lemmatizer for English, adapted from MorphAdorner's EnglishLemmatizer. */
class MorphAdornerLemmatizer extends EnglishLemmatizer {
    /** List of detachment rules. */
    protected static String rulesFileName = "englishrules.txt";

    /**
     * Create an English lemmatizer.
     *
     * @throws Exception
     *             because the {@link EnglishLemmatizer} constructor throws Exception
     */
    public MorphAdornerLemmatizer() throws Exception {
        // discard the rules of the original MorphAdorner lemmatizer
        rules.clear();
        // load the new rules
        try {
            loadRules(BioLemmatizer.class.getResource(rulesFileName), "utf-8");
        } catch (IOException e) {
            throw new RuntimeException("Unable to load English rules file.", e);
        }
        // discard the irregular-forms table of the original MorphAdorner lemmatizer;
        // the irregular English forms are integrated into the current lexicon
        irregularForms.clear();
    }
}

/**
 * BioWordLexicon: biomedical word lexicon which extends MorphAdorner's English word lexicon.
 */
class BioWordLexicon extends DefaultLexicon {
    /** Resource path to the word lexicon. */
    protected static final String lexiconPath = "lexicon.lex";

    /**
     * Create a lexicon, loading either the default biomedical lexicon from the classpath or the
     * supplied lexicon file.
     *
     * @param lexiconFile
     *            the lexicon file to load; if null, the default lexicon is used
     * @throws IOException
     */
    public BioWordLexicon(File lexiconFile) throws IOException {
        // Create an empty lexicon.
        super();
        if (lexiconFile == null) {
            // Load the default word lexicon.
            loadLexicon(BioLemmatizer.class.getResource(lexiconPath), "utf-8");
        } else {
            loadLexicon(lexiconFile.toURI().toURL(), "utf-8");
        }
    }
}

/** POSEntry: stores different POS tags and the corresponding tagset labels */
class POSEntry {
    public Map<String, String> tagToTagSet;

    /**
     * Constructor to initialize the class field by loading the different POS tagsets
     */
    public POSEntry() {
        tagToTagSet = new HashMap<String, String>();
        // NUPOS tags
        Lexicon wordLexicon;
        try {
            wordLexicon = new DefaultWordLexicon();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        addNewTagSet(Arrays.asList(wordLexicon.getCategories()), "NUPOS");

        // PennPOS tags
        String mappingFileName = "PennPOStoNUPOS.mapping";
        InputStream is = BioLemmatizer.class.getResourceAsStream(mappingFileName);
        Map<String, String[]> mappingPennPOStoNUPOS;
        try {
            mappingPennPOStoNUPOS = BioLemmatizer.loadPOSMappingFile(is);
        } catch (IOException e) {
            throw new RuntimeException("Error while opening mapping file: " + mappingFileName, e);
        }
        addNewTagSet(mappingPennPOStoNUPOS.keySet(), "PennPOS");
    }

    /**
     * Add a new POS tagset
     *
     * @param tags
     *            a set of POS tags
     * @param tagSetLabel
     *            the corresponding tagset label
     */
    public void addNewTagSet(Collection<String> tags, String tagSetLabel) {
        for (String tag : tags) {
            tagToTagSet.put(tag, tagSetLabel);
        }
    }

    /**
     * Retrieve the tagset label of the input POS tag
     *
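     * <p>
     * For example, {@code getTagSetLabel("NNS")} returns {@code "PennPOS"}, assuming NNS appears as
     * a key in the Penn Treebank mapping file; an unrecognized tag yields {@code "NONE"}.
     * </p>
     *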
     * @param category
     *            an input POS tag
     * @return the corresponding POS tagset label
     */
    public String getTagSetLabel(String category) {
        String defaultLabel = "NONE";
        return tagToTagSet.containsKey(category) ? tagToTagSet.get(category) : defaultLabel;
    }
}