001 /* 002 Copyright (c) 2012, Regents of the University of Colorado 003 All rights reserved. 004 005 Redistribution and use in source and binary forms, with or without modification, 006 are permitted provided that the following conditions are met: 007 008 * Redistributions of source code must retain the above copyright notice, this 009 list of conditions and the following disclaimer. 010 011 * Redistributions in binary form must reproduce the above copyright notice, 012 this list of conditions and the following disclaimer in the documentation 013 and/or other materials provided with the distribution. 014 015 * Neither the name of the University of Colorado nor the names of its 016 contributors may be used to endorse or promote products derived from this 017 software without specific prior written permission. 018 019 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 023 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030 package edu.ucdenver.ccp.nlp.biolemmatizer.uima; 031 032 import java.lang.reflect.Constructor; 033 import java.lang.reflect.InvocationTargetException; 034 import java.lang.reflect.Method; 035 import java.util.ArrayList; 036 import java.util.Iterator; 037 import java.util.List; 038 039 import org.apache.uima.UimaContext; 040 import org.apache.uima.analysis_engine.AnalysisEngine; 041 import org.apache.uima.analysis_engine.AnalysisEngineDescription; 042 import org.apache.uima.analysis_engine.AnalysisEngineProcessException; 043 import org.apache.uima.cas.CAS; 044 import org.apache.uima.fit.component.JCasAnnotator_ImplBase; 045 import org.apache.uima.fit.descriptor.ConfigurationParameter; 046 import org.apache.uima.fit.factory.AnalysisEngineFactory; 047 import org.apache.uima.fit.factory.ConfigurationParameterFactory; 048 import org.apache.uima.jcas.JCas; 049 import org.apache.uima.jcas.tcas.Annotation; 050 import org.apache.uima.resource.ResourceInitializationException; 051 import org.apache.uima.resource.metadata.TypeSystemDescription; 052 053 import edu.ucdenver.ccp.nlp.biolemmatizer.BioLemmatizer; 054 import edu.ucdenver.ccp.nlp.biolemmatizer.LemmataEntry; 055 import edu.ucdenver.ccp.uima.shims.annotation.AnnotationDataExtractor; 056 import edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.LemmaDecorator; 057 import edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeechDecorator; 058 059 /** 060 * This annotator processes tokens in the CAS and inserts corresponding lemmas. This annotator is 061 * type-system-independent and relies on implementations of TokenAttributeExtractor, 062 * TokenAttributeInserter, and AnnotationDataExtractor in order to function as intended. 063 * 064 * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu 065 * 066 */ 067 public class BioLemmatizer_AE extends JCasAnnotator_ImplBase { 068 069 /** 070 * Parameter name used in the UIMA descriptor file for the token type 071 */ 072 public static final String PARAM_TOKEN_TYPE_NAME = "tokenTypeName"; 073 074 /** 075 * The token type to use. This parameter serves a dual purpose. It can be used to represent the 076 * class name of the token annotation type to retrieve from the CAS, e.g. 077 * org.apache.uima.examples.tokenizer.Token. When this is the case, annotations of this type 078 * will be processed, and lemmas for the text covered by these annotations will be determined. 079 * The second use for this configuration parameter is when it is used in conjunction with an 080 * {@link AnnotationDataExtractor}. The 081 * {@link AnnotationDataExtractor#getAnnotationType(Annotation)} method returns a {@link String} 082 * and when the value of that returned {@link String} equals the value of the tokenTypeName, 083 * then that annotation will be treated as a token and its covered text will be lemmatized. 084 */ 085 @ConfigurationParameter(mandatory = true, description = "") 086 private String tokenTypeName; 087 088 /** 089 * Parameter name used in the UIMA descriptor file for the name of the method that returns a 090 * part-of-speech (as a String) from the token annotation 091 */ 092 public static final String PARAM_TOKEN_GET_POS_METHOD_NAME = "tokenGetPosMethodName"; 093 094 /** 095 * This is an optional parameter. It is used in conjunction with the tokenTypeName when that 096 * parameter represents a class name, i.e. when it represents the name of token annotation 097 * classes in the CAS. The tokenGetPosMethodName should be the name of the method in the 098 * tokenTypeName class that returns the part-of-speech tag. <br> 099 * <br> 100 * If this field is not set, then either the input tokens do not have part-of-speech information 101 * associated with them, or the tokenTypeName configuration parameter is not the name of an 102 * annotation class, but is instead a type as in the second scenario described above. 103 */ 104 @ConfigurationParameter(mandatory = false, description = "") 105 private String tokenGetPosMethodName; 106 107 /** 108 * Parameter name used in the UIMA descriptor file for the {@link PartOfSpeechDecorator} 109 * implementation to use 110 */ 111 public static final String PARAM_POS_DECORATOR_CLASS = "posDecoratorClassName"; 112 113 /** 114 * The name of the {@link PartOfSpeechDecorator} implementation to use 115 */ 116 @ConfigurationParameter(mandatory = false, description = "name of the PartOfSpeechDecorator implementation to use") 117 private String posDecoratorClassName; 118 119 /** 120 * The {@link PartOfSpeechDecorator} that will be initialized to the class specified by the 121 * {@link #posDecoratorClassName} configuration parameter 122 */ 123 private PartOfSpeechDecorator posDecorator; 124 125 /** 126 * Parameter name used in the UIMA descriptor file for the {@link LemmaDecorator} implementation 127 * to use 128 */ 129 public static final String PARAM_LEMMA_DECORATOR_CLASS = "lemmaDecoratorClassName"; 130 131 /** 132 * The name of the {@link LemmaDecorator} implementation to use 133 */ 134 @ConfigurationParameter(mandatory = true, description = "name of the LemmaDecorator implementation to use", defaultValue = "edu.ucdenver.ccp.nlp.biolemmatizer.uima.DefaultLemmaDecorator") 135 private String lemmaDecoratorClassName; 136 137 /** 138 * The {@link LemmaDecorator} that will be initialized to the class specified by the 139 * {@link #lemmaDecoratorClassName} configuration parameter 140 */ 141 private LemmaDecorator lemmaDecorator; 142 143 /** 144 * Parameter name used in the UIMA descriptor file for the annotation data extractor 145 * implementation to use 146 */ 147 public static final String PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS = "annotationDataExtractorClassName"; 148 149 /** 150 * The name of the {@link AnnotationDataExtractor} implementation to use 151 */ 152 @ConfigurationParameter(mandatory = true, description = "name of the AnnotationDataExtractor implementation to use", defaultValue = "edu.ucdenver.ccp.uima.shims.annotation.impl.DefaultAnnotationDataExtractor") 153 private String annotationDataExtractorClassName; 154 155 /** 156 * this {@link AnnotationDataExtractor} will be initialized based on the class name specified by 157 * the annotationDataExtractorClassName parameter 158 */ 159 private AnnotationDataExtractor annotationDataExtractor; 160 161 /** 162 * This {@link BioLemmatizer} will do the bulk of the work in the 163 * {@link BioLemmatizer_AE#process(JCas)} method 164 */ 165 private BioLemmatizer bioLemmatizer; 166 167 /** 168 * Initializes the {@link BioLemmatizer} that will be used by the 169 * {@link BioLemmatizer_AE#process(JCas)} method 170 * 171 * @see org.uimafit.component.JCasAnnotator_ImplBase#initialize(org.apache.uima.UimaContext) 172 */ 173 @Override 174 public void initialize(UimaContext context) throws ResourceInitializationException { 175 super.initialize(context); 176 bioLemmatizer = new BioLemmatizer(); 177 lemmaDecorator = (LemmaDecorator) invokeNoArgsConstructor(lemmaDecoratorClassName); 178 annotationDataExtractor = (AnnotationDataExtractor) invokeNoArgsConstructor(annotationDataExtractorClassName); 179 if (posDecoratorClassName != null) { 180 posDecorator = (PartOfSpeechDecorator) invokeNoArgsConstructor(posDecoratorClassName); 181 } 182 } 183 184 /** 185 * Returns an instantiation of the class specified by the input {@link String}. Assumes default 186 * constructor, i.e. no arguments. 187 * 188 * @param className 189 * @param arguments 190 * @return 191 */ 192 public static Object invokeNoArgsConstructor(String className) { 193 try { 194 Class<?> cls = Class.forName(className); 195 Constructor<?> constructor = cls.getConstructor(); 196 if (!constructor.isAccessible()) 197 constructor.setAccessible(true); 198 return constructor.newInstance(); 199 } catch (ClassNotFoundException e) { 200 throw new RuntimeException(e); 201 } catch (SecurityException e) { 202 throw new RuntimeException(e); 203 } catch (NoSuchMethodException e) { 204 throw new RuntimeException(e); 205 } catch (IllegalArgumentException e) { 206 throw new RuntimeException(e); 207 } catch (InstantiationException e) { 208 throw new RuntimeException(e); 209 } catch (IllegalAccessException e) { 210 throw new RuntimeException(e); 211 } catch (InvocationTargetException e) { 212 throw new RuntimeException(e); 213 } 214 } 215 216 /** 217 * This process(JCas) method cycles through all annotations in the CAS. For those that are 218 * identified as tokens by {@link AnnotationDataExtractor} implementation being used, an attempt 219 * is made to extract part-of-speech information. The covered text for each token is then 220 * lemmatized using the {@link BioLemmatizer}, using the part-of-speech information if it was 221 * available. Results from the {@link BioLemmatizer} are added to the CAS via the specified 222 * {@link LemmaDecorator} implementation. 223 * 224 * @see org.apache.uima.analysis_component.JCasAnnotator_ImplBase#process(org.apache.uima.jcas.JCas) 225 */ 226 @Override 227 public void process(JCas jCas) throws AnalysisEngineProcessException { 228 for (Iterator<Annotation> annotIter = jCas.getJFSIndexRepository().getAnnotationIndex().iterator(); annotIter 229 .hasNext();) { 230 Annotation annotation = annotIter.next(); 231 String annotationType = annotationDataExtractor.getAnnotationType(annotation); 232 if (annotationType != null && annotationType.equals(tokenTypeName)) { 233 List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> posTags = getPartOfSpeechTags(annotation); 234 if (posTags == null || posTags.isEmpty()) { 235 runBioLemmatizer(annotation, null); 236 } else { 237 for (edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech posTag : posTags) { 238 runBioLemmatizer(annotation, posTag.getPosTag()); 239 } 240 } 241 } 242 } 243 } 244 245 /** 246 * This method uses the {@link BioLemmatizer} to lemmatize the covered text of the input 247 * {@link Annotation}. The lemma is added to the CAS via the {@link LemmaDecorator} 248 * implementation specified in this AE's configuration. 249 * 250 * @param annotation 251 * @param posTag 252 */ 253 private void runBioLemmatizer(Annotation annotation, String posTag) { 254 String coveredText = annotationDataExtractor.getCoveredText(annotation); 255 LemmataEntry lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(coveredText, posTag); 256 Annotation lemmaAnnot = lemmaDecorator.getAnnotationToDecorate(annotation, annotationDataExtractor); 257 for (edu.ucdenver.ccp.nlp.biolemmatizer.LemmataEntry.Lemma lemma : lemmata.getLemmas()) { 258 lemmaDecorator.insertLemma(lemmaAnnot, 259 new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.Lemma(lemma.getLemma(), 260 new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech(lemma.getPos(), 261 lemma.getTagSetName()))); 262 } 263 } 264 265 /** 266 * This method defaults to using the {@link PartOfSpeechDecorator} instance if there is one 267 * initialized. If not available, it will try to use the getPosMethod specified in the 268 * configuration. If neither are available, it is assumed that there is no input part-of-speech 269 * info and null is returned. 270 * 271 * @param annotation 272 * 273 * @return the POS tag as extracted from the input {@link Annotation} 274 */ 275 private List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> getPartOfSpeechTags( 276 Annotation annotation) { 277 if (posDecorator != null) { 278 return posDecorator.extractPartsOfSpeech(annotation); 279 } 280 if (tokenGetPosMethodName != null) { 281 List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> posTagList = new ArrayList<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech>(); 282 posTagList.add(getPosTagUsingSpecifiedMethodName(annotation)); 283 return posTagList; 284 } 285 return null; 286 } 287 288 /** 289 * If the getPosTag method name is specified (and if no PartOfSpeechDecorator is specified) then 290 * this method is used to call the getPosTag method on the input {@link Annotation}. 291 * 292 * @param annotation 293 * @return the {@link edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech} 294 * extracted from the input {@link Annotation} using the specified 295 * {@link #tokenGetPosMethodName}. 296 */ 297 private edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech getPosTagUsingSpecifiedMethodName( 298 Annotation annotation) { 299 try { 300 Method method = annotation.getClass().getDeclaredMethod(tokenGetPosMethodName); 301 String posTag = method.invoke(annotation).toString(); 302 return new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech(posTag, null); 303 } catch (NoSuchMethodException e) { 304 throw new IllegalArgumentException( 305 "Error while attempting to retrieve part-of-speech information from class: " 306 + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e); 307 } catch (IllegalAccessException e) { 308 throw new IllegalArgumentException( 309 "Error while attempting to retrieve part-of-speech information from class: " 310 + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e); 311 } catch (InvocationTargetException e) { 312 throw new IllegalArgumentException( 313 "Error while attempting to retrieve part-of-speech information from class: " 314 + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e); 315 } 316 } 317 318 /** 319 * Initializes an {@link AnalysisEngine} that will determine lemmas for tokens that are present 320 * in the {@link CAS} 321 * 322 * @param tsd 323 * @param tokenClass 324 * @param tokenGetPosMethodName 325 * @param annotationDataExtractorClass 326 * @param lemmaDecoratorClass 327 * @return 328 * @throws ResourceInitializationException 329 * 330 */ 331 public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd, 332 Class<? extends Annotation> tokenClass, String tokenGetPosMethodName, 333 Class<? extends AnnotationDataExtractor> annotationDataExtractorClass, 334 Class<? extends LemmaDecorator> lemmaDecoratorClass) throws ResourceInitializationException { 335 return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME, 336 tokenClass.getName(), PARAM_TOKEN_GET_POS_METHOD_NAME, tokenGetPosMethodName, 337 PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS, annotationDataExtractorClass.getName(), 338 PARAM_LEMMA_DECORATOR_CLASS, lemmaDecoratorClass.getName()); 339 } 340 341 /** 342 * @param tsd 343 * @param tokenType 344 * @param partOfSpeechDecoratorClass 345 * @param annotationDataExtractorClass 346 * @param lemmaDecoratorClass 347 * @return 348 * @throws ResourceInitializationException 349 */ 350 public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd, 351 String tokenType, Class<? extends PartOfSpeechDecorator> partOfSpeechDecoratorClass, 352 Class<? extends AnnotationDataExtractor> annotationDataExtractorClass, 353 Class<? extends LemmaDecorator> lemmaDecoratorClass) throws ResourceInitializationException { 354 return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME, 355 tokenType, PARAM_POS_DECORATOR_CLASS, partOfSpeechDecoratorClass.getName(), 356 PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS, annotationDataExtractorClass.getName(), 357 PARAM_LEMMA_DECORATOR_CLASS, lemmaDecoratorClass.getName()); 358 } 359 360 /** 361 * @param tsd 362 * @param tokenClass 363 * @return 364 * @throws ResourceInitializationException 365 */ 366 public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd, 367 Class<? extends Annotation> tokenClass) throws ResourceInitializationException { 368 return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME, 369 tokenClass.getName()); 370 } 371 372 /** 373 * @param tsd 374 * @param tokenClass 375 * @param tokenGetPosMethodName 376 * @return 377 * @throws ResourceInitializationException 378 */ 379 public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd, 380 Class<? extends Annotation> tokenClass, String tokenGetPosMethodName) 381 throws ResourceInitializationException { 382 return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME, 383 tokenClass.getName(), PARAM_TOKEN_GET_POS_METHOD_NAME, tokenGetPosMethodName); 384 } 385 386 }