001 /* 002 Copyright (c) 2012, Regents of the University of Colorado 003 All rights reserved. 004 005 Redistribution and use in source and binary forms, with or without modification, 006 are permitted provided that the following conditions are met: 007 008 * Redistributions of source code must retain the above copyright notice, this 009 list of conditions and the following disclaimer. 010 011 * Redistributions in binary form must reproduce the above copyright notice, 012 this list of conditions and the following disclaimer in the documentation 013 and/or other materials provided with the distribution. 014 015 * Neither the name of the University of Colorado nor the names of its 016 contributors may be used to endorse or promote products derived from this 017 software without specific prior written permission. 018 019 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 023 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030 package edu.ucdenver.ccp.nlp.biolemmatizer.uima; 031 032 import java.util.ArrayList; 033 import java.util.List; 034 035 import org.apache.uima.cas.CASException; 036 import org.apache.uima.fit.util.JCasUtil; 037 import org.apache.uima.jcas.JCas; 038 import org.apache.uima.jcas.cas.FSArray; 039 import org.apache.uima.jcas.cas.StringArray; 040 import org.apache.uima.jcas.tcas.Annotation; 041 042 import edu.ucdenver.ccp.uima.shims.annotation.AnnotationDataExtractor; 043 import edu.ucdenver.ccp.uima.shims.annotation.Span; 044 import edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.Lemma; 045 import edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.LemmaDecorator; 046 047 /** 048 * This simple implementation of the {@link LemmaDecorator} interface adds a new 049 * {@link LemmaAnnotation} to the CAS for each token annotation processed. 050 * 051 * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu 052 * 053 */ 054 public class DefaultLemmaDecorator implements LemmaDecorator { 055 056 /** 057 * @return an initialized {@link LemmaAnnotation} 058 * @see edu.ucdenver.ccp.uima.shims.annotation.AnnotationDecorator#newAnnotation(org.apache.uima.jcas.JCas, 059 * java.lang.String, edu.ucdenver.ccp.uima.shims.annotation.Span) 060 */ 061 @Override 062 public Annotation newAnnotation(JCas jcas, @SuppressWarnings("unused") String type, Span span) { 063 LemmaAnnotation lemmaAnnot = new LemmaAnnotation(jcas, span.getSpanStart(), span.getSpanEnd()); 064 lemmaAnnot.addToIndexes(); 065 return lemmaAnnot; 066 } 067 068 /** 069 * In the case of the {@link DefaultLemmaDecorator}, the annotation to decorate is a 070 * {@link LemmaAnnotation} with the same span as the token that was used to process the lemma. 071 * This method looks to see if that {@link LemmaAnnotation} exists. If it does it is returned. 072 * If it does not exist, then a new {@link LemmaAnnotation} is created and then returned. 073 * 074 * @param tokenAnnotation 075 * in this case, the input annotation represents the token annotation whose covered 076 * text was lemmatized 077 * 078 * @see edu.ucdenver.ccp.uima.shims.annotation.AnnotationDecorator#getAnnotationToDecorate(org.apache.uima.jcas.tcas.Annotation, 079 * edu.ucdenver.ccp.uima.shims.annotation.AnnotationDataExtractor) 080 */ 081 @Override 082 public Annotation getAnnotationToDecorate(Annotation tokenAnnotation, 083 @SuppressWarnings("unused") AnnotationDataExtractor annotationDataExtractor) { 084 JCas jCas = null; 085 try { 086 jCas = tokenAnnotation.getCAS().getJCas(); 087 } catch (CASException e) { 088 throw new IllegalStateException(e); 089 } 090 List<LemmaAnnotation> existingLemmaAnnotations = JCasUtil.selectCovered(jCas, LemmaAnnotation.class, 091 tokenAnnotation); 092 if (existingLemmaAnnotations.isEmpty()) { 093 return newAnnotation(jCas, null, new Span(tokenAnnotation.getBegin(), tokenAnnotation.getEnd())); 094 } 095 if (existingLemmaAnnotations.size() == 1) { 096 return existingLemmaAnnotations.get(0); 097 } 098 throw new IllegalStateException("Multiple LemmaAnnotations covering: " + tokenAnnotation.toString(0)); 099 } 100 101 /** 102 * 103 * @see edu.ucdenver.ccp.uima.shims.annotation.AnnotationDecorator#decorateAnnotation(org.apache 104 * .uima.jcas.tcas.Annotation, java.lang.String, java.lang.Object) 105 */ 106 @Override 107 public void decorateAnnotation(Annotation annotation, @SuppressWarnings("unused") String attributeType, Lemma lemma) { 108 insertLemma(annotation, lemma); 109 } 110 111 /** 112 * Inserts information representing the input {@link Lemma} into the input {@link Annotation} 113 * which is assumed to be of type {@link LemmaAnnotation} in this instance. 114 * 115 * @throws IllegalArgumentException 116 * if the input {@link Annotation} is not a {@link LemmaAnnotation} 117 * 118 * @see edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.LemmaDecorator#insertLemma(org.apache.uima.jcas.tcas.Annotation, 119 * edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.Lemma) 120 */ 121 @Override 122 public void insertLemma(Annotation annotation, Lemma lemma) { 123 checkAnnotationType(annotation); 124 LemmaAnnotation lemmaAnnot = (LemmaAnnotation) annotation; 125 try { 126 addLemma(lemmaAnnot, lemma); 127 } catch (CASException e) { 128 throw new IllegalStateException(e); 129 } 130 } 131 132 /** 133 * Checks that the input {@link Annotation} is a {@link LemmaAnnotation} 134 * 135 * @param annotation 136 * @throws IllegalArgumentException 137 * if the input {@link Annotation} is not a {@link LemmaAnnotation} 138 * 139 */ 140 private static void checkAnnotationType(Annotation annotation) { 141 if (!(annotation instanceof LemmaAnnotation)) { 142 throw new IllegalArgumentException( 143 "Expecting LemmaAnnotation class. Unable to assign lemma information to annotation of type: " 144 + annotation.getClass().getName()); 145 } 146 } 147 148 /** 149 * Transfers information from the input {@link Lemma} to the input {@link LemmaAnnotation}, 150 * specifically the lemmatized string and its accompanying part-of-speech 151 * 152 * @param lemmaAnnot 153 * @param lemma 154 * @throws CASException 155 * if the {@link JCas} is not retrievable from the input {@link LemmaAnnotation} 156 */ 157 private static void addLemma(LemmaAnnotation lemmaAnnot, Lemma lemma) throws CASException { 158 JCas jCas = lemmaAnnot.getCAS().getJCas(); 159 StringArray lemmas = UimaUtil.addToStringArray(lemmaAnnot.getLemmas(), lemma.getLemma(), jCas); 160 PartOfSpeech pos = getPartOfSpeech(lemma, jCas); 161 FSArray partsOfSpeech = UimaUtil.addToFSArray(lemmaAnnot.getPartsOfSpeech(), pos, jCas); 162 lemmaAnnot.setLemmas(lemmas); 163 lemmaAnnot.setPartsOfSpeech(partsOfSpeech); 164 } 165 166 /** 167 * @param lemmaAnnot 168 * @return a List of {@link Lemma} objects that were found in the input {@link LemmaAnnotation} 169 */ 170 private static List<Lemma> extractLemmas(LemmaAnnotation lemmaAnnot) { 171 List<Lemma> lemmasToReturn = new ArrayList<Lemma>(); 172 StringArray lemmas = lemmaAnnot.getLemmas(); 173 FSArray partsOfSpeech = lemmaAnnot.getPartsOfSpeech(); 174 for (int i = 0; i < lemmas.size(); i++) { 175 String lemmaStr = lemmas.get(i); 176 PartOfSpeech pos = (PartOfSpeech) partsOfSpeech.get(i); 177 lemmasToReturn.add(new Lemma(lemmaStr, pos.getPosTag(), pos.getTagSetName())); 178 } 179 return lemmasToReturn; 180 } 181 182 /** 183 * @param lemma 184 * @param jCas 185 * @return a {@link PartOfSpeech} object initialized from the input {@link Lemma} 186 */ 187 private static PartOfSpeech getPartOfSpeech(Lemma lemma, JCas jCas) { 188 PartOfSpeech pos = new PartOfSpeech(jCas); 189 pos.setPosTag(lemma.getPos().getPosTag()); 190 pos.setTagSetName(lemma.getPos().getTagSetName()); 191 return pos; 192 } 193 194 /* 195 * (non-Javadoc) 196 * 197 * @see 198 * edu.ucdenver.ccp.uima.shims.annotation.AnnotationDecorator#extractAttribute(org.apache.uima 199 * .jcas.tcas.Annotation, java.lang.String) 200 */ 201 @Override 202 public List<Lemma> extractAttribute(Annotation annotation, String attributeType) { 203 return extractLemmas(annotation); 204 } 205 206 /* 207 * (non-Javadoc) 208 * 209 * @see 210 * edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.LemmaDecorator#extractLemma(org.apache 211 * .uima.jcas.tcas.Annotation) 212 */ 213 @Override 214 public List<Lemma> extractLemmas(Annotation annotation) { 215 checkAnnotationType(annotation); 216 LemmaAnnotation lemmaAnnot = (LemmaAnnotation) annotation; 217 return extractLemmas(lemmaAnnot); 218 } 219 220 }