001 /* 002 Copyright (c) 2012, Regents of the University of Colorado 003 All rights reserved. 004 005 Redistribution and use in source and binary forms, with or without modification, 006 are permitted provided that the following conditions are met: 007 008 * Redistributions of source code must retain the above copyright notice, this 009 list of conditions and the following disclaimer. 010 011 * Redistributions in binary form must reproduce the above copyright notice, 012 this list of conditions and the following disclaimer in the documentation 013 and/or other materials provided with the distribution. 014 015 * Neither the name of the University of Colorado nor the names of its 016 contributors may be used to endorse or promote products derived from this 017 software without specific prior written permission. 018 019 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 023 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030 031 package edu.ucdenver.ccp.nlp.biolemmatizer; 032 033 import java.io.BufferedReader; 034 import java.io.BufferedWriter; 035 import java.io.File; 036 import java.io.FileInputStream; 037 import java.io.FileNotFoundException; 038 import java.io.FileOutputStream; 039 import java.io.IOException; 040 import java.io.InputStream; 041 import java.io.InputStreamReader; 042 import java.io.OutputStreamWriter; 043 import java.nio.charset.Charset; 044 import java.nio.charset.CodingErrorAction; 045 import java.util.ArrayList; 046 import java.util.Arrays; 047 import java.util.Collection; 048 import java.util.HashMap; 049 import java.util.HashSet; 050 import java.util.List; 051 import java.util.Map; 052 import java.util.Set; 053 054 import org.kohsuke.args4j.CmdLineException; 055 import org.kohsuke.args4j.CmdLineParser; 056 057 import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.EnglishLemmatizer; 058 import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.Lemmatizer; 059 import edu.northwestern.at.utils.corpuslinguistics.lexicon.DefaultLexicon; 060 import edu.northwestern.at.utils.corpuslinguistics.lexicon.DefaultWordLexicon; 061 import edu.northwestern.at.utils.corpuslinguistics.lexicon.Lexicon; 062 import edu.northwestern.at.utils.corpuslinguistics.lexicon.LexiconEntry; 063 import edu.northwestern.at.utils.corpuslinguistics.partsofspeech.PartOfSpeechTags; 064 import edu.northwestern.at.utils.corpuslinguistics.tokenizer.PennTreebankTokenizer; 065 import edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer; 066 067 /** 068 * BioLemmatizer: Lemmatize a word in biomedical texts and return its lemma; the part of speech 069 * (POS) of the word is optional. 070 * 071 * <p> 072 * Usage: 073 * </p> 074 * 075 * <p> 076 * <code> 077 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] {@literal <input_string>} [POS tag] or<br> 078 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] -i {@literal <input_file_name> -o <output_file_name>} or<br> 079 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar [-l] -t<br> 080 * </code> 081 * </p> 082 * 083 * <p> 084 * Example: 085 * </p> 086 * 087 * <p> 088 * <code> 089 * java -Xmx1G -jar biolemmatizer-core-1.0-jar-with-dependencies.jar catalyses NNS 090 * </code> 091 * </p> 092 * 093 * <p> 094 * Please see the README file for more usage examples 095 * </p> 096 * 097 * @author Haibin Liu <Haibin.Liu@ucdenver.edu>, William A Baumgartner Jr 098 * <William.Baumgartner@ucdenver.edu> and Karin Verspoor <Karin.Verspoor@ucdenver.edu> 099 */ 100 101 public class BioLemmatizer { 102 /** Lemma separator character */ 103 public static String lemmaSeparator = "||"; 104 105 /** BioLemmatizer */ 106 public Lemmatizer lemmatizer; 107 108 /** Word lexicon for lemma lookup */ 109 public Lexicon wordLexicon; 110 111 /** NUPOS tags */ 112 public PartOfSpeechTags partOfSpeechTags; 113 114 /** Extract individual word parts from a contracted word. */ 115 public WordTokenizer spellingTokenizer; 116 117 /** Hierachical mapping file from PennPOS to NUPOS */ 118 public Map<String, String[]> mappingPennPOStoNUPOS; 119 120 /** Hierachical mapping file from major class to Penn Treebank POS */ 121 public Map<String, String[]> mappingMajorClasstoPennPOS; 122 123 /** the Part-Of-Speech mapping file */ 124 protected static String mappingFileName; 125 126 /** POSEntry object to retrieve POS tag information */ 127 public POSEntry posEntry; 128 129 /** 130 * Default constructor loads the lexicon from the classpath 131 */ 132 public BioLemmatizer() { 133 this(null); 134 } 135 136 /** 137 * Constructor to initialize the class fields 138 * 139 * @param lexiconFile 140 * a reference to the lexicon file to use. If null, the lexicon that comes with the 141 * BioLemmatizer distribution is loaded from the classpath 142 */ 143 public BioLemmatizer(File lexiconFile) { 144 145 // Get the default rule-based lemmatizer. 146 try { 147 lemmatizer = new MorphAdornerLemmatizer(); 148 } catch (Exception e) { 149 throw new RuntimeException(e); 150 } 151 // Get default word lexicon. 152 try { 153 wordLexicon = new BioWordLexicon(lexiconFile); 154 } catch (IOException e) { 155 throw new RuntimeException(e); 156 } 157 // Get the part of speech tags from the word lexicon. 158 partOfSpeechTags = wordLexicon.getPartOfSpeechTags(); 159 // Get spelling tokenizer. 160 spellingTokenizer = new PennTreebankTokenizer(); 161 // Set the lexicon which may provide lemmata. 162 lemmatizer.setLexicon(wordLexicon); 163 // Set the dictionary for checking lemmata after applying lemmatization 164 // rules. 165 lemmatizer.setDictionary(setDictionary(wordLexicon)); 166 167 // Specify the Part-Of-Speech mapping files 168 mappingFileName = "PennPOStoNUPOS.mapping"; 169 InputStream is = BioLemmatizer.class.getResourceAsStream(mappingFileName); 170 try { 171 mappingPennPOStoNUPOS = loadPOSMappingFile(is); 172 } catch (IOException e) { 173 throw new RuntimeException("Unable to load mapping: " + mappingFileName, e); 174 } 175 176 mappingFileName = "MajorClasstoPennPOS.mapping"; 177 is = BioLemmatizer.class.getResourceAsStream(mappingFileName); 178 try { 179 mappingMajorClasstoPennPOS = loadPOSMappingFile(is); 180 } catch (IOException e) { 181 throw new RuntimeException("Unable to load mapping: " + mappingFileName, e); 182 } 183 184 // Get the POS tagsets 185 posEntry = new POSEntry(); 186 } 187 188 /** 189 * Static method to load a Part-Of-Speech mapping file 190 * 191 * @param is 192 * InputStream of the mapping file 193 * @return a Map object that stores the hierachical mapping information in the file 194 * @throws IOException 195 */ 196 static Map<String, String[]> loadPOSMappingFile(InputStream is) throws IOException { 197 Map<String, String[]> mapping = new HashMap<String, String[]>(); 198 199 try { 200 InputStreamReader isr = new InputStreamReader(is); 201 BufferedReader input = new BufferedReader(isr); 202 203 String line = null; 204 while ((line = input.readLine()) != null) { 205 line = line.trim(); 206 String[] pair = line.split("\t"); 207 String[] mappingSet = pair[1].split(","); 208 mapping.put(pair[0], mappingSet); 209 // remove the first empty char with unicode FEFF 210 mapping.put(pair[0].replaceAll("^\\uFEFF", ""), mappingSet); 211 } 212 213 input.close(); 214 isr.close(); 215 } finally { 216 is.close(); 217 } 218 return mapping; 219 } 220 221 /** 222 * Create a dictionary from a word lexicon for validating lemmata resulted from lemmatization 223 * rules 224 * 225 * @param wordLexicon 226 * a word lexicon 227 * @return a set that contains a dictionary generated from the word lexicon 228 */ 229 private Set<String> setDictionary(Lexicon wordLexicon) { 230 Set<String> dictionarySet = new HashSet<String>(); 231 232 // generate dictionary from lexicon 233 String[] lexiconEntries = wordLexicon.getEntries(); 234 for (String entry : lexiconEntries) { 235 String[] lemmata = wordLexicon.getLemmata(entry); 236 for (String lemma : lemmata) { 237 dictionarySet.add(lemma.toLowerCase()); 238 } 239 } 240 241 return dictionarySet; 242 } 243 244 /** 245 * Retrieve an array of corresponding NUPOS tags of a Penn Treebank POS tag 246 * 247 * @param partOfSpeech 248 * a POS tag 249 * @return an array of corresponding NUPOS tags; 250 */ 251 private String[] getNUPOSTagFromPennPOS(String partOfSpeech) { 252 String[] nuPOSTag = mappingPennPOStoNUPOS.get(partOfSpeech.toUpperCase()); 253 return nuPOSTag != null ? nuPOSTag : new String[] { partOfSpeech }; 254 } 255 256 /** 257 * Retrieve an array of corresponding Penn Treebank POS tags of a NUPOS tag 258 * 259 * @param partOfSpeech 260 * a POS tag 261 * @return an array of corresponding Penn Treebank POS tags; 262 */ 263 private String[] getPennPOSFromNUPOS(String partOfSpeech) { 264 List<String> result = new ArrayList<String>(); 265 for (String key : mappingPennPOStoNUPOS.keySet()) { 266 for (String value : mappingPennPOStoNUPOS.get(key)) { 267 if (value.equals(partOfSpeech)) { 268 result.add(key); 269 break; 270 } 271 } 272 } 273 return result.size() != 0 ? result.toArray(new String[result.size()]) : new String[] { partOfSpeech }; 274 } 275 276 /** 277 * Retrieve sibling Penn Treebank POS tags of a Penn Treebank POS tag from the POS hierarchy 278 * 279 * @param partOfSpeech 280 * a Penn Treebank POS tag 281 * @return sibling Penn Treebank POS tags of the Penn Treebank POS tag 282 */ 283 private String[] getSiblingPennPOSTag(String partOfSpeech) { 284 // check if partOfSpeech exists in the hierarchy 285 boolean globalFlag = false; 286 for (String key : mappingMajorClasstoPennPOS.keySet()) { 287 String[] posTag = mappingMajorClasstoPennPOS.get(key); 288 for (String pos : posTag) { 289 if (pos.equals(partOfSpeech)) { 290 globalFlag = true; 291 break; 292 } 293 } 294 if (globalFlag) 295 break; 296 } 297 298 if (globalFlag) { 299 String foundKey = ""; 300 for (String key : mappingMajorClasstoPennPOS.keySet()) { 301 String[] posTag = mappingMajorClasstoPennPOS.get(key); 302 boolean localFlag = false; 303 for (String pos : posTag) { 304 if (pos.equals(partOfSpeech)) { 305 foundKey = key; 306 localFlag = true; 307 break; 308 } 309 } 310 if (localFlag) 311 break; 312 } 313 List<String> merge = new ArrayList<String>(); 314 for (String pos : mappingMajorClasstoPennPOS.get(foundKey)) { 315 if (!pos.equals(partOfSpeech)) 316 merge.add(pos); 317 } 318 return merge.toArray(new String[merge.size()]); 319 } else { 320 return new String[] { partOfSpeech }; 321 } 322 } 323 324 /** 325 * Retrieve sibling NUPOS tags of a Penn Treebank POS tag from the POS hierarchy 326 * 327 * @param partOfSpeech 328 * a Penn Treebank POS tag 329 * @return sibling NUPOS tags of the Penn Treebank POS tag 330 */ 331 private String[] getSiblingNUPOSTag(String partOfSpeech) { 332 // check if partOfSpeech exists in the hierarchy 333 boolean globalFlag = false; 334 for (String key : mappingMajorClasstoPennPOS.keySet()) { 335 String[] posTag = mappingMajorClasstoPennPOS.get(key); 336 for (String pos : posTag) { 337 if (pos.equals(partOfSpeech)) { 338 globalFlag = true; 339 break; 340 } 341 } 342 if (globalFlag) 343 break; 344 } 345 346 if (globalFlag) { 347 String foundKey = ""; 348 for (String key : mappingMajorClasstoPennPOS.keySet()) { 349 String[] posTag = mappingMajorClasstoPennPOS.get(key); 350 boolean localFlag = false; 351 for (String pos : posTag) { 352 if (pos.equals(partOfSpeech)) { 353 foundKey = key; 354 localFlag = true; 355 break; 356 } 357 } 358 if (localFlag) 359 break; 360 } 361 List<String> merge = new ArrayList<String>(); 362 for (String pos : mappingMajorClasstoPennPOS.get(foundKey)) { 363 if (!pos.equals(partOfSpeech)) { 364 merge.addAll(Arrays.asList(mappingPennPOStoNUPOS.get(pos.toUpperCase()))); 365 } 366 } 367 368 return merge.toArray(new String[merge.size()]); 369 } else { 370 return new String[] { partOfSpeech }; 371 } 372 } 373 374 /** 375 * Retrieve lemmas and the corresponding categories of the input string 376 * 377 * @param spelling 378 * an input string 379 * @return a Map object that stores lemmas and categories of the string; key: category, value: 380 * lemma 381 */ 382 private Map<String, String> getLemmasAndCategories(String spelling) { 383 Map<String, String> lemmasAndCategories = new HashMap<String, String>(); 384 LexiconEntry lexiconEntry = wordLexicon.getLexiconEntry(spelling); 385 if (lexiconEntry != null) 386 lemmasAndCategories = lexiconEntry.lemmata; 387 return lemmasAndCategories; 388 } 389 390 /** 391 * Clean up the raw lemma resulted from lemmatization rules 392 * 393 * @param lemma 394 * a raw lemma 395 * @return clean lemma 396 */ 397 private static String cleanUpLemma(String lemma) { 398 String newLemma = lemma; 399 String lastChar = lemma.substring(lemma.length() - 1); 400 if (lastChar.equals("'")) { 401 newLemma = lemma.substring(0, lemma.length() - 1); 402 } 403 return newLemma; 404 } 405 406 /** 407 * Convert special unicode characters into modern English spelling 408 * 409 * @param input 410 * an input string 411 * @return modern English spelling 412 */ 413 static String unicodeHandler(String input) { 414 // define the mapping between special unicode characters and modern 415 // English spelling 416 Map<String, String> specialUnicodeCharToModernEnglishMapping = new HashMap<String, String>(); 417 418 specialUnicodeCharToModernEnglishMapping.put("u00E6", "ae"); 419 specialUnicodeCharToModernEnglishMapping.put("u0153", "oe"); 420 specialUnicodeCharToModernEnglishMapping.put("u00E4", "a"); 421 specialUnicodeCharToModernEnglishMapping.put("u00E0", "a"); 422 specialUnicodeCharToModernEnglishMapping.put("u00E1", "a"); 423 specialUnicodeCharToModernEnglishMapping.put("u0113", "e"); 424 specialUnicodeCharToModernEnglishMapping.put("u00E9", "e"); 425 specialUnicodeCharToModernEnglishMapping.put("u00E8", "e"); 426 specialUnicodeCharToModernEnglishMapping.put("u00EB", "e"); 427 specialUnicodeCharToModernEnglishMapping.put("u00EF", "i"); 428 specialUnicodeCharToModernEnglishMapping.put("u00F1", "n"); 429 specialUnicodeCharToModernEnglishMapping.put("u014D", "o"); 430 specialUnicodeCharToModernEnglishMapping.put("u00F6", "o"); 431 specialUnicodeCharToModernEnglishMapping.put("u00F4", "o"); 432 specialUnicodeCharToModernEnglishMapping.put("u016B", "u"); 433 specialUnicodeCharToModernEnglishMapping.put("u00FA", "u"); 434 435 String output = input; 436 for (String unicode : specialUnicodeCharToModernEnglishMapping.keySet()) { 437 String regex = "\\" + unicode; 438 output = output.replaceAll(regex, specialUnicodeCharToModernEnglishMapping.get(unicode)); 439 } 440 441 return output; 442 } 443 444 /** 445 * Lemmatize a string with POS tag using Lexicon only 446 * 447 * @param spelling 448 * an input string 449 * @param partOfSpeech 450 * POS tag of the input string 451 * @return a LemmaEntry object containing lemma and POS information 452 */ 453 public LemmataEntry lemmatizeByLexicon(String spelling, String partOfSpeech) { 454 Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>(); 455 String lemmata = spelling; 456 String lemmataTag; 457 if (partOfSpeech == null) 458 partOfSpeech = ""; 459 // default POS tag = NONE 460 if (partOfSpeech.trim().length() == 0) 461 lemmataTag = "NONE"; 462 else 463 lemmataTag = partOfSpeech; 464 465 // check the POS tagset 466 String tagSetLabel = posEntry.getTagSetLabel(partOfSpeech); 467 468 String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech); 469 470 // Different lexicon search methods are tried in order to 471 // augument the use of 472 // lexicon 473 String lemma = "*"; 474 String category = "*"; 475 476 if (tagSetLabel.equals("PennPOS")) { 477 // direct PennPOS tag search 478 lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech); 479 if (lemma.equals("*")) { 480 lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech); 481 } 482 if (!lemma.equals("*")) { 483 lemmata = lemma; 484 category = partOfSpeech; 485 // System.out.println("found in the Penn direct lexicon: "+lemma); 486 } 487 // PennPOS tag hierachical search 488 if (lemma.equals("*")) { 489 String[] hierarachicalPennPOSTag = getSiblingPennPOSTag(partOfSpeech); 490 for (String pos : hierarachicalPennPOSTag) { 491 lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos); 492 if (lemma.equals("*")) { 493 lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos); 494 } 495 if (!lemma.equals("*")) { 496 lemmata = lemma; 497 category = pos; 498 // System.out.println("found in the Penn hierachical lexicon: "+lemma); 499 break; 500 } 501 } 502 } 503 // Turn PennPOS tag into NUSPOS tag and search 504 if (lemma.equals("*")) { 505 for (String pos : nuPOSTag) { 506 lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos); 507 if (lemma.equals("*")) { 508 lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos); 509 } 510 if (!lemma.equals("*")) { 511 lemmata = lemma; 512 category = pos; 513 // System.out.println("found in the converted NU direct lexicon: "+lemma); 514 break; 515 } 516 } 517 } 518 // NUSPOS tag hierachical search 519 if (lemma.equals("*")) { 520 String[] hierarachicalNUPOSTag = getSiblingNUPOSTag(partOfSpeech); 521 for (String pos : hierarachicalNUPOSTag) { 522 lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos); 523 if (lemma.equals("*")) { 524 lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos); 525 } 526 if (!lemma.equals("*")) { 527 lemmata = lemma; 528 category = pos; 529 // System.out.println("found in the converted NU hierachical lexicon: "+lemma); 530 break; 531 } 532 } 533 } 534 } else if (tagSetLabel.equals("NUPOS")) { 535 // direct NUPOS tag search 536 lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech); 537 if (lemma.equals("*")) { 538 lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech); 539 } 540 if (!lemma.equals("*")) { 541 lemmata = lemma; 542 category = partOfSpeech; 543 // System.out.println("found in the NU direct lexicon: "+lemma); 544 } 545 // NUPOS tag hierachical search 546 if (lemma.equals("*")) { 547 String[] hierarachicalNUPOSTag = getSiblingNUPOSTag(getPennPOSFromNUPOS(partOfSpeech)[0]); 548 for (String pos : hierarachicalNUPOSTag) { 549 lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos); 550 if (lemma.equals("*")) { 551 lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos); 552 } 553 if (!lemma.equals("*")) { 554 lemmata = lemma; 555 category = pos; 556 // System.out.println("found in the NU hierachical lexicon: "+lemma); 557 break; 558 } 559 } 560 } 561 } 562 563 // backup lexicon lookup process: search without POS tags, return all 564 // lemmas 565 Map<String, String> lemmasAndCategories = new HashMap<String, String>(); 566 if (tagSetLabel.equals("NONE") || lemma.equals("*")) { 567 // if ( tagSetLabel.equals("NONE") ) { 568 lemmasAndCategories = getLemmasAndCategories(spelling.toLowerCase()); 569 if (lemmasAndCategories.isEmpty()) { 570 lemmasAndCategories = getLemmasAndCategories(spelling.toUpperCase()); 571 } 572 } 573 574 // found the Lemma 575 if (!lemmasAndCategories.isEmpty()) { 576 lemmataAndLemmataTag = lemmasAndCategories; 577 // System.out.println("found in the lexicon"); 578 } else if (!lemma.equals("*")) { 579 lemmata = lemma; 580 lemmataTag = category; 581 lemmataAndLemmataTag.put(lemmataTag, lemmata); 582 } 583 // lexicon has been checked but nothing found, return original input 584 else 585 lemmataAndLemmataTag.put(lemmataTag, lemmata); 586 587 return new LemmataEntry(lemmataAndLemmataTag, posEntry); 588 } 589 590 /** 591 * Lemmatize a string with POS tag using lemmatization rules only 592 * 593 * @param spelling 594 * an input string 595 * @param partOfSpeech 596 * POS tag of the input string 597 * @return a LemmaEntry object containing lemma and POS information 598 */ 599 public LemmataEntry lemmatizeByRules(String spelling, String partOfSpeech) { 600 // option to have a dictionary for rule-based lemmatizer to validate results 601 // lemmatizer.setDictionary(new HashSet<String>()); 602 603 Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>(); 604 String lemmata = spelling; 605 String lemmataTag; 606 // default POS tag = NONE 607 if (partOfSpeech == null) 608 partOfSpeech = ""; 609 if (partOfSpeech.trim().length() == 0) 610 lemmataTag = "NONE"; 611 else 612 lemmataTag = partOfSpeech; 613 614 String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech); 615 616 // Use rule-based lemmatizer. 617 618 // Get lemmatization word class for part of speech, 619 String lemmaClass = ""; 620 for (String pos : nuPOSTag) { 621 lemmaClass = partOfSpeechTags.getLemmaWordClass(pos); 622 if (lemmaClass.length() != 0) { 623 break; 624 } 625 } 626 627 // Do not lemmatize words which should not be lemmatized, ?including 628 // proper names?. 629 if (lemmatizer.cantLemmatize(spelling) || lemmaClass.equals("none")) { 630 } else { 631 // Try compound word exceptions list first. 632 lemmata = lemmatizer.lemmatize(spelling, "compound"); 633 634 // If lemma not found, keep trying. 635 if (lemmata.equalsIgnoreCase(spelling)) { 636 // Extract individual word parts. 637 // May be more than one for a 638 // contraction. 639 List<String> wordList = spellingTokenizer.extractWords(spelling); 640 641 // If just one word part, get its lemma. 642 if (!partOfSpeechTags.isCompoundTag(partOfSpeech) || (wordList.size() == 1)) { 643 if (lemmaClass.length() == 0) { 644 lemmata = lemmatizer.lemmatize(spelling); 645 } else { 646 lemmata = lemmatizer.lemmatize(spelling, lemmaClass); 647 } 648 } 649 // More than one word part. 650 // Get lemma for each part and 651 // concatenate them with the 652 // lemma separator to form a 653 // compound lemma. 654 else { 655 lemmata = ""; 656 String[] posTags = partOfSpeechTags.splitTag(partOfSpeech); 657 658 if (posTags.length == wordList.size()) { 659 for (int i = 0; i < wordList.size(); i++) { 660 String wordPiece = wordList.get(i); 661 if (i > 0) { 662 lemmata = lemmata + lemmaSeparator; 663 } 664 665 LemmataEntry lemmaPiece = lemmatizeByRules(wordPiece, posTags[i]); 666 667 lemmata = lemmata + lemmaPiece.lemmasToString(); 668 } 669 } 670 } 671 } 672 } 673 674 lemmataAndLemmataTag.put(lemmataTag, lemmata); 675 676 return new LemmataEntry(lemmataAndLemmataTag, posEntry); 677 678 } 679 680 /** 681 * Lemmatize a string with POS tag using both lexicon lookup and lemmatization rules This is the 682 * preferred method as it gives the best lemmatization performance 683 * 684 * @param spelling 685 * an input string 686 * @param partOfSpeech 687 * POS tag of the input string 688 * @return a LemmaEntry object containing lemma and POS information 689 */ 690 public LemmataEntry lemmatizeByLexiconAndRules(String spelling, String partOfSpeech) { 691 692 Map<String, String> lemmataAndLemmataTag = new HashMap<String, String>(); 693 String lemmata = spelling; 694 String lemmataTag; 695 // default POS tag = NONE 696 if (partOfSpeech == null) 697 partOfSpeech = ""; 698 if (partOfSpeech.trim().length() == 0) 699 lemmataTag = "NONE"; 700 else 701 lemmataTag = partOfSpeech; 702 703 // check the POS tagset 704 String tagSetLabel = posEntry.getTagSetLabel(partOfSpeech); 705 706 String[] nuPOSTag = getNUPOSTagFromPennPOS(partOfSpeech); 707 708 // Try lexicon first, different search methods are tried in order to 709 // augument the use of 710 // lexicon 711 String lemma = "*"; 712 String category = "*"; 713 714 if (tagSetLabel.equals("PennPOS")) { 715 // direct PennPOS tag search 716 lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech); 717 if (lemma.equals("*")) { 718 lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech); 719 } 720 if (!lemma.equals("*")) { 721 category = partOfSpeech; 722 // System.out.println("found in the Penn direct lexicon: "+lemma); 723 } 724 // PennPOS tag hierachical search 725 if (lemma.equals("*")) { 726 String[] hierarachicalPennPOSTag = getSiblingPennPOSTag(partOfSpeech); 727 for (String pos : hierarachicalPennPOSTag) { 728 lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos); 729 if (lemma.equals("*")) { 730 lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos); 731 } 732 if (!lemma.equals("*")) { 733 category = pos; 734 // System.out.println("found in the Penn hierachical lexicon: "+lemma); 735 break; 736 } 737 } 738 } 739 // Turn PennPOS tag into NUSPOS tag and search 740 if (lemma.equals("*")) { 741 for (String pos : nuPOSTag) { 742 lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos); 743 if (lemma.equals("*")) { 744 lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos); 745 } 746 if (!lemma.equals("*")) { 747 category = pos; 748 // System.out.println("found in the converted NU direct lexicon: "+lemma); 749 break; 750 } 751 } 752 } 753 // NUSPOS tag hierachical search 754 if (lemma.equals("*")) { 755 String[] hierarachicalNUPOSTag = getSiblingNUPOSTag(partOfSpeech); 756 for (String pos : hierarachicalNUPOSTag) { 757 lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos); 758 if (lemma.equals("*")) { 759 lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos); 760 } 761 if (!lemma.equals("*")) { 762 category = pos; 763 // System.out.println("found in the converted NU hierachical lexicon: "+lemma); 764 break; 765 } 766 } 767 } 768 } else if (tagSetLabel.equals("NUPOS")) { 769 // direct NUPOS tag search 770 lemma = wordLexicon.getLemma(spelling.toLowerCase(), partOfSpeech); 771 if (lemma.equals("*")) { 772 lemma = wordLexicon.getLemma(spelling.toUpperCase(), partOfSpeech); 773 } 774 if (!lemma.equals("*")) { 775 category = partOfSpeech; 776 // System.out.println("found in the NU direct lexicon: "+lemma); 777 } 778 // NUPOS tag hierachical search 779 if (lemma.equals("*")) { 780 String[] hierarachicalNUPOSTag = getSiblingNUPOSTag(getPennPOSFromNUPOS(partOfSpeech)[0]); 781 for (String pos : hierarachicalNUPOSTag) { 782 lemma = wordLexicon.getLemma(spelling.toLowerCase(), pos); 783 if (lemma.equals("*")) { 784 lemma = wordLexicon.getLemma(spelling.toUpperCase(), pos); 785 } 786 if (!lemma.equals("*")) { 787 category = pos; 788 // System.out.println("found in the NU hierachical lexicon: "+lemma); 789 break; 790 } 791 } 792 } 793 } 794 795 // if tagSetLabel is NONE, invoke the backup lexicon lookup process: 796 // search without POS tags, return all lemmas 797 Map<String, String> lemmasAndCategories = new HashMap<String, String>(); 798 // if ( tagSetLabel.equals("NONE") || lemma.equals( "*" ) ) { 799 if (tagSetLabel.equals("NONE")) { 800 lemmasAndCategories = getLemmasAndCategories(spelling.toLowerCase()); 801 if (lemmasAndCategories.isEmpty()) { 802 lemmasAndCategories = getLemmasAndCategories(spelling.toUpperCase()); 803 } 804 } 805 806 // found the Lemma 807 if (!lemmasAndCategories.isEmpty()) { 808 lemmataAndLemmataTag = lemmasAndCategories; 809 // System.out.println("found in the lexicon"); 810 } else if (!lemma.equals("*")) { 811 lemmata = lemma; 812 lemmataTag = category; 813 lemmataAndLemmataTag.put(lemmataTag, lemmata); 814 } 815 // for testing purpose to test lexicon only 816 // else lemmataAndLemmataTag.put(lemmataTag, lemmata) ; 817 818 // Lemma not found in word lexicon. Use rule-based lemmatizer. 819 else { 820 821 // Get lemmatization word class for part of speech, 822 String lemmaClass = ""; 823 for (String pos : nuPOSTag) { 824 lemmaClass = partOfSpeechTags.getLemmaWordClass(pos); 825 if (lemmaClass.length() != 0) { 826 break; 827 } 828 } 829 830 // Do not lemmatize words which should not be lemmatized, ?including 831 // proper names?. 832 if (lemmatizer.cantLemmatize(spelling) || lemmaClass.equals("none")) { 833 } else { 834 // Try compound word exceptions list first. 835 lemmata = lemmatizer.lemmatize(spelling, "compound"); 836 837 // If lemma not found, keep trying. 838 if (lemmata.equalsIgnoreCase(spelling)) { 839 // Extract individual word parts. 840 // May be more than one for a 841 // contraction. 842 List<String> wordList = spellingTokenizer.extractWords(spelling); 843 844 // If just one word part, get its lemma. 845 if (!partOfSpeechTags.isCompoundTag(partOfSpeech) || (wordList.size() == 1)) { 846 if (lemmaClass.length() == 0) { 847 lemmata = lemmatizer.lemmatize(spelling); 848 } else { 849 lemmata = lemmatizer.lemmatize(spelling, lemmaClass); 850 } 851 } 852 // More than one word part. 853 // Get lemma for each part and 854 // concatenate them with the 855 // lemma separator to form a 856 // compound lemma. 857 else { 858 lemmata = ""; 859 String[] posTags = partOfSpeechTags.splitTag(partOfSpeech); 860 861 if (posTags.length == wordList.size()) { 862 for (int i = 0; i < wordList.size(); i++) { 863 String wordPiece = wordList.get(i); 864 if (i > 0) { 865 lemmata = lemmata + lemmaSeparator; 866 } 867 868 LemmataEntry lemmaPiece = lemmatizeByLexiconAndRules(wordPiece, posTags[i]); 869 870 lemmata = lemmata + lemmaPiece.lemmasToString(); 871 } 872 } 873 } 874 } 875 } 876 877 lemmataAndLemmataTag.put(lemmataTag, lemmata); 878 } 879 880 return new LemmataEntry(lemmataAndLemmataTag, posEntry); 881 } 882 883 /** 884 * Input arguments are parsed into a {@link BioLemmatizerCmdOpts} object. Valid input arguments 885 * include: 886 * 887 * <pre> 888 * VAL : Single input to be lemmatized 889 * VAL : Part of speech of the single input to be lemmatized 890 * -f VAL : optional path to a lexicon file. If not set, the default lexicon 891 * available on the classpath is used 892 * -i VAL : the path to the input file 893 * -l : if present, only the lemma is returned (part-of-speech information is 894 * suppressed) 895 * -o VAL : the path to the output file 896 * -t : if present, the interactive mode is used 897 * </pre> 898 * 899 * 900 * 901 * @param args 902 */ 903 public static void main(String[] args) { 904 905 BioLemmatizerCmdOpts options = new BioLemmatizerCmdOpts(); 906 CmdLineParser parser = new CmdLineParser(options); 907 try { 908 parser.parseArgument(args); 909 } catch (CmdLineException e) { 910 System.err.println(e.getMessage()); 911 parser.printUsage(System.err); 912 return; 913 } 914 915 File lexiconFile = options.getLexiconFile(); 916 BioLemmatizer bioLemmatizer = new BioLemmatizer(lexiconFile); 917 boolean americanize = options.americanizedLemma(); 918 boolean outputLemmaOnly = options.outputLemmaOnly(); 919 boolean useInteractiveMode = options.useInteractiveMode(); 920 String inputStr = options.getInputStr(); 921 if (inputStr != null) 922 inputStr = inputStr.trim(); 923 String inputStrPos = options.getInputStrPos(); 924 File inputFile = options.getInputFile(); 925 File outputFile = options.getOutputFile(); 926 System.out.println("========================================================="); 927 System.out.println("========================================================="); 928 System.out.println("========================================================="); 929 System.out.println("Running BioLemmatizer...."); 930 try { 931 if (useInteractiveMode) { 932 runInteractiveMode(bioLemmatizer, outputLemmaOnly, americanize); 933 } else if (inputStr != null) { 934 LemmataEntry lemmata; 935 if(americanize) { 936 lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(new Americanize().americanize(unicodeHandler(inputStr)), inputStrPos); 937 } 938 else 939 lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(inputStr), inputStrPos); 940 if (outputLemmaOnly) { 941 System.out.println("The lemma for '" +inputStr+ "' is: " + lemmata.lemmasToString()); 942 } else { 943 System.out.println("The lemma for '" +inputStr+ "' is: " + lemmata); 944 } 945 } else if (inputFile != null) { 946 if (outputFile == null) { 947 System.err.println("Output file must be set if the input file parameter is used."); 948 parser.printUsage(System.err); 949 } 950 processInputFile(inputFile, outputFile, bioLemmatizer, outputLemmaOnly, americanize); 951 } else { 952 System.err.println("Invalid input parameters..."); 953 parser.printUsage(System.err); 954 } 955 } catch (IOException e) { 956 throw new RuntimeException(e); 957 } 958 System.out.println("========================================================="); 959 System.out.println("========================================================="); 960 System.out.println("========================================================="); 961 } 962 963 /** 964 * @param inputFile 965 * @param outputFile 966 * @param bioLemmatizer 967 * @param outputLemmaOnly 968 * @throws IOException 969 */ 970 private static void processInputFile(File inputFile, File outputFile, BioLemmatizer bioLemmatizer, 971 boolean outputLemmaOnly, boolean americanize) throws IOException { 972 Americanize convert = null; 973 if(americanize) 974 convert = new Americanize(); 975 BufferedReader input; 976 BufferedWriter output; 977 978 try { 979 // input = FileReaderUtil.initBufferedReader(inputFile, CharacterEncoding.UTF_8); 980 input = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8") 981 .newDecoder().onMalformedInput(CodingErrorAction.REPORT) 982 .onUnmappableCharacter(CodingErrorAction.REPORT))); 983 } catch (FileNotFoundException e) { 984 throw new RuntimeException("Unable to open the input file: " + inputFile.getAbsolutePath(), e); 985 } 986 987 try { 988 // output = FileWriterUtil.initBufferedWriter(outputFile, CharacterEncoding.UTF_8, 989 // WriteMode.OVERWRITE, FileSuffixEnforcement.OFF); 990 output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile, false), Charset 991 .forName("UTF-8").newEncoder().onMalformedInput(CodingErrorAction.REPORT) 992 .onUnmappableCharacter(CodingErrorAction.REPORT))); 993 } catch (FileNotFoundException e) { 994 throw new RuntimeException("Unable to open the output file: " + outputFile.getAbsolutePath(), e); 995 } 996 997 String line = null; 998 999 while ((line = input.readLine()) != null) { 1000 if (line.trim().length() == 0) { 1001 output.write("\n"); 1002 continue; 1003 } 1004 line = line.trim(); 1005 String[] pair = line.split("\t"); 1006 String pos; 1007 if (pair.length == 1) { 1008 pos = ""; 1009 } else { 1010 pos = pair[1]; 1011 } 1012 LemmataEntry lemmata; 1013 if(americanize) 1014 lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(convert.americanize(unicodeHandler(pair[0])), pos); 1015 else 1016 lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(pair[0]), pos); 1017 String result; 1018 if (outputLemmaOnly) { 1019 result = line + "\t" + lemmata.lemmasToString() + "\n"; 1020 } else { 1021 result = line + "\t" + lemmata + "\n"; 1022 } 1023 output.write(result); 1024 } 1025 // close input 1026 input.close(); 1027 // close output 1028 output.close(); 1029 } 1030 1031 private static void runInteractiveMode(BioLemmatizer bioLemmatizer, boolean outputLemmaOnly, boolean americanize) throws IOException { 1032 Americanize convert = null; 1033 if(americanize) 1034 convert = new Americanize(); 1035 BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); 1036 String input; 1037 System.out 1038 .println("Running BioLemmatizer in interactive mode. Please type a word to be lemmatized with an optional part-of-speech, e.g. \"run\" or \"run NN\""); 1039 while ((input = in.readLine()) != null && input.length() != 0) { 1040 String[] arguments = input.split("\\s"); 1041 if (arguments.length > 2) { 1042 System.out.println("Only one word to be lemmatized (with or without POS) is allowed"); 1043 System.exit(0); 1044 } 1045 String spelling = arguments[0].trim(); 1046 String partOfSpeech = (arguments.length == 2) ? arguments[1].trim() : null; 1047 LemmataEntry lemmata; 1048 if(americanize) 1049 lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(convert.americanize(unicodeHandler(spelling)), partOfSpeech); 1050 else 1051 lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(unicodeHandler(spelling), partOfSpeech); 1052 if (outputLemmaOnly) { 1053 System.out.println(lemmata.lemmasToString()); 1054 } else { 1055 System.out.println(lemmata); 1056 } 1057 } 1058 } 1059 } 1060 1061 /** Lemmatizer for English. */ 1062 class MorphAdornerLemmatizer extends EnglishLemmatizer { 1063 /** list of detachment rules. */ 1064 protected static String rulesFileName = "englishrules.txt"; 1065 1066 /** 1067 * Create an English lemmatizer. 1068 * 1069 * @throws Exception 1070 * because the {@link EnglishLemmatizer} constructor throws Exception 1071 * 1072 */ 1073 public MorphAdornerLemmatizer() throws Exception { 1074 // release the rules of original MorphAdorner Lemmatizer 1075 rules.clear(); 1076 // load new rules 1077 try { 1078 loadRules(BioLemmatizer.class.getResource(rulesFileName), "utf-8"); 1079 } catch (IOException e) { 1080 throw new RuntimeException("Unable to load English rules file.", e); 1081 } 1082 // release the irregularForm file of original MorphAdorner Lemmatizer 1083 // the irregular English forms are integrated into current Lexicon 1084 irregularForms.clear(); 1085 } 1086 } 1087 1088 /** 1089 * BioWordLexicon: Biomedical word Lexicon which extends MorphAdorner's English word lexicon. 1090 */ 1091 class BioWordLexicon extends DefaultLexicon { 1092 /** Resource path to word lexicon. */ 1093 protected static final String lexiconPath = "lexicon.lex"; 1094 1095 /** 1096 * Create an empty lexicon. 1097 * 1098 * @throws IOException 1099 */ 1100 public BioWordLexicon(File lexiconFile) throws IOException { 1101 // Create empty lexicon. 1102 super(); 1103 if (lexiconFile == null) { 1104 // Load default word lexicon. 1105 loadLexicon(BioLemmatizer.class.getResource(lexiconPath), "utf-8"); 1106 } else { 1107 loadLexicon(lexiconFile.toURI().toURL(), "utf-8"); 1108 } 1109 } 1110 } 1111 1112 /** POSEntry: store different POS tags and the corresponding tagset label */ 1113 class POSEntry { 1114 public Map<String, String> tagToTagSet; 1115 1116 /** 1117 * Construtor to initialize the class field by loading different POS tagsets 1118 */ 1119 public POSEntry() { 1120 tagToTagSet = new HashMap<String, String>(); 1121 // NUPOS tags 1122 Lexicon wordLexicon; 1123 try { 1124 wordLexicon = new DefaultWordLexicon(); 1125 } catch (Exception e) { 1126 throw new RuntimeException(e); 1127 } 1128 addNewTagSet(Arrays.asList(wordLexicon.getCategories()), "NUPOS"); 1129 1130 // PennPOS tags 1131 String mappingFileName = "PennPOStoNUPOS.mapping"; 1132 InputStream is = BioLemmatizer.class.getResourceAsStream(mappingFileName); 1133 Map<String, String[]> mappingPennPOStoNUPOS; 1134 try { 1135 mappingPennPOStoNUPOS = BioLemmatizer.loadPOSMappingFile(is); 1136 } catch (IOException e) { 1137 throw new RuntimeException("Error while opening mapping file: " + mappingFileName, e); 1138 } 1139 addNewTagSet(mappingPennPOStoNUPOS.keySet(), "PennPOS"); 1140 } 1141 1142 /** 1143 * Add new POS tagset 1144 * 1145 * @param tags 1146 * a set of POS tags 1147 * @param tagSetLabel 1148 * the corresponding tagset label 1149 */ 1150 public void addNewTagSet(Collection<String> tags, String tagSetLabel) { 1151 for (String tag : tags) { 1152 tagToTagSet.put(tag, tagSetLabel); 1153 } 1154 } 1155 1156 /** 1157 * Retrieve the tagset label of the input POS tag 1158 * 1159 * @param category 1160 * an input POS tag 1161 * @return the corresponding POS tagset label 1162 */ 1163 public String getTagSetLabel(String category) { 1164 String defaultLabel = "NONE"; 1165 return tagToTagSet.containsKey(category) ? tagToTagSet.get(category) : defaultLabel; 1166 } 1167 }