001    /*
002     Copyright (c) 2012, Regents of the University of Colorado
003     All rights reserved.
004    
005     Redistribution and use in source and binary forms, with or without modification, 
006     are permitted provided that the following conditions are met:
007    
008     * Redistributions of source code must retain the above copyright notice, this 
009        list of conditions and the following disclaimer.
010       
011     * Redistributions in binary form must reproduce the above copyright notice, 
012        this list of conditions and the following disclaimer in the documentation 
013        and/or other materials provided with the distribution.
014       
015     * Neither the name of the University of Colorado nor the names of its 
016        contributors may be used to endorse or promote products derived from this 
017        software without specific prior written permission.
018    
019     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
020     ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
021     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
022     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
023     ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
024     (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
025     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
026     ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
027     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
028     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029     */
030    package edu.ucdenver.ccp.nlp.biolemmatizer.uima;
031    
032    import java.lang.reflect.Constructor;
033    import java.lang.reflect.InvocationTargetException;
034    import java.lang.reflect.Method;
035    import java.util.ArrayList;
036    import java.util.Iterator;
037    import java.util.List;
038    
039    import org.apache.uima.UimaContext;
040    import org.apache.uima.analysis_engine.AnalysisEngine;
041    import org.apache.uima.analysis_engine.AnalysisEngineDescription;
042    import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
043    import org.apache.uima.cas.CAS;
044    import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
045    import org.apache.uima.fit.descriptor.ConfigurationParameter;
046    import org.apache.uima.fit.factory.AnalysisEngineFactory;
047    import org.apache.uima.fit.factory.ConfigurationParameterFactory;
048    import org.apache.uima.jcas.JCas;
049    import org.apache.uima.jcas.tcas.Annotation;
050    import org.apache.uima.resource.ResourceInitializationException;
051    import org.apache.uima.resource.metadata.TypeSystemDescription;
052    
053    import edu.ucdenver.ccp.nlp.biolemmatizer.BioLemmatizer;
054    import edu.ucdenver.ccp.nlp.biolemmatizer.LemmataEntry;
055    import edu.ucdenver.ccp.uima.shims.annotation.AnnotationDataExtractor;
056    import edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.LemmaDecorator;
057    import edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeechDecorator;
058    
059    /**
060     * This annotator processes tokens in the CAS and inserts corresponding lemmas. This annotator is
061     * type-system-independent and relies on implementations of TokenAttributeExtractor,
062     * TokenAttributeInserter, and AnnotationDataExtractor in order to function as intended.
063     * 
064     * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu
065     * 
066     */
067    public class BioLemmatizer_AE extends JCasAnnotator_ImplBase {
068    
069            /**
070             * Parameter name used in the UIMA descriptor file for the token type
071             */
072            public static final String PARAM_TOKEN_TYPE_NAME = "tokenTypeName";
073    
074            /**
075             * The token type to use. This parameter serves a dual purpose. It can be used to represent the
076             * class name of the token annotation type to retrieve from the CAS, e.g.
077             * org.apache.uima.examples.tokenizer.Token. When this is the case, annotations of this type
078             * will be processed, and lemmas for the text covered by these annotations will be determined.
079             * The second use for this configuration parameter is when it is used in conjunction with an
080             * {@link AnnotationDataExtractor}. The
081             * {@link AnnotationDataExtractor#getAnnotationType(Annotation)} method returns a {@link String}
082             * and when the value of that returned {@link String} equals the value of the tokenTypeName,
083             * then that annotation will be treated as a token and its covered text will be lemmatized.
084             */
085            @ConfigurationParameter(mandatory = true, description = "")
086            private String tokenTypeName;
087    
088            /**
089             * Parameter name used in the UIMA descriptor file for the name of the method that returns a
090             * part-of-speech (as a String) from the token annotation
091             */
092            public static final String PARAM_TOKEN_GET_POS_METHOD_NAME = "tokenGetPosMethodName";
093    
094            /**
095             * This is an optional parameter. It is used in conjunction with the tokenTypeName when that
096             * parameter represents a class name, i.e. when it represents the name of token annotation
097             * classes in the CAS. The tokenGetPosMethodName should be the name of the method in the
098             * tokenTypeName class that returns the part-of-speech tag. <br>
099             * <br>
100             * If this field is not set, then either the input tokens do not have part-of-speech information
101             * associated with them, or the tokenTypeName configuration parameter is not the name of an
102             * annotation class, but is instead a type as in the second scenario described above.
103             */
104            @ConfigurationParameter(mandatory = false, description = "")
105            private String tokenGetPosMethodName;
106    
107            /**
108             * Parameter name used in the UIMA descriptor file for the {@link PartOfSpeechDecorator}
109             * implementation to use
110             */
111            public static final String PARAM_POS_DECORATOR_CLASS = "posDecoratorClassName";
112    
113            /**
114             * The name of the {@link PartOfSpeechDecorator} implementation to use
115             */
116            @ConfigurationParameter(mandatory = false, description = "name of the PartOfSpeechDecorator implementation to use")
117            private String posDecoratorClassName;
118    
119            /**
120             * The {@link PartOfSpeechDecorator} that will be initialized to the class specified by the
121             * {@link #posDecoratorClassName} configuration parameter
122             */
123            private PartOfSpeechDecorator posDecorator;
124    
125            /**
126             * Parameter name used in the UIMA descriptor file for the {@link LemmaDecorator} implementation
127             * to use
128             */
129            public static final String PARAM_LEMMA_DECORATOR_CLASS = "lemmaDecoratorClassName";
130    
131            /**
132             * The name of the {@link LemmaDecorator} implementation to use
133             */
134            @ConfigurationParameter(mandatory = true, description = "name of the LemmaDecorator implementation to use", defaultValue = "edu.ucdenver.ccp.nlp.biolemmatizer.uima.DefaultLemmaDecorator")
135            private String lemmaDecoratorClassName;
136    
137            /**
138             * The {@link LemmaDecorator} that will be initialized to the class specified by the
139             * {@link #lemmaDecoratorClassName} configuration parameter
140             */
141            private LemmaDecorator lemmaDecorator;
142    
143            /**
144             * Parameter name used in the UIMA descriptor file for the annotation data extractor
145             * implementation to use
146             */
147            public static final String PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS = "annotationDataExtractorClassName";
148    
149            /**
150             * The name of the {@link AnnotationDataExtractor} implementation to use
151             */
152            @ConfigurationParameter(mandatory = true, description = "name of the AnnotationDataExtractor implementation to use", defaultValue = "edu.ucdenver.ccp.uima.shims.annotation.impl.DefaultAnnotationDataExtractor")
153            private String annotationDataExtractorClassName;
154    
155            /**
156             * this {@link AnnotationDataExtractor} will be initialized based on the class name specified by
157             * the annotationDataExtractorClassName parameter
158             */
159            private AnnotationDataExtractor annotationDataExtractor;
160    
161            /**
162             * This {@link BioLemmatizer} will do the bulk of the work in the
163             * {@link BioLemmatizer_AE#process(JCas)} method
164             */
165            private BioLemmatizer bioLemmatizer;
166    
167            /**
168             * Initializes the {@link BioLemmatizer} that will be used by the
169             * {@link BioLemmatizer_AE#process(JCas)} method
170             * 
171             * @see org.uimafit.component.JCasAnnotator_ImplBase#initialize(org.apache.uima.UimaContext)
172             */
173            @Override
174            public void initialize(UimaContext context) throws ResourceInitializationException {
175                    super.initialize(context);
176                    bioLemmatizer = new BioLemmatizer();
177                    lemmaDecorator = (LemmaDecorator) invokeNoArgsConstructor(lemmaDecoratorClassName);
178                    annotationDataExtractor = (AnnotationDataExtractor) invokeNoArgsConstructor(annotationDataExtractorClassName);
179                    if (posDecoratorClassName != null) {
180                            posDecorator = (PartOfSpeechDecorator) invokeNoArgsConstructor(posDecoratorClassName);
181                    }
182            }
183    
184            /**
185             * Returns an instantiation of the class specified by the input {@link String}. Assumes default
186             * constructor, i.e. no arguments.
187             * 
188             * @param className
189             * @param arguments
190             * @return
191             */
192            public static Object invokeNoArgsConstructor(String className) {
193                    try {
194                            Class<?> cls = Class.forName(className);
195                            Constructor<?> constructor = cls.getConstructor();
196                            if (!constructor.isAccessible())
197                                    constructor.setAccessible(true);
198                            return constructor.newInstance();
199                    } catch (ClassNotFoundException e) {
200                            throw new RuntimeException(e);
201                    } catch (SecurityException e) {
202                            throw new RuntimeException(e);
203                    } catch (NoSuchMethodException e) {
204                            throw new RuntimeException(e);
205                    } catch (IllegalArgumentException e) {
206                            throw new RuntimeException(e);
207                    } catch (InstantiationException e) {
208                            throw new RuntimeException(e);
209                    } catch (IllegalAccessException e) {
210                            throw new RuntimeException(e);
211                    } catch (InvocationTargetException e) {
212                            throw new RuntimeException(e);
213                    }
214            }
215    
216            /**
217             * This process(JCas) method cycles through all annotations in the CAS. For those that are
218             * identified as tokens by {@link AnnotationDataExtractor} implementation being used, an attempt
219             * is made to extract part-of-speech information. The covered text for each token is then
220             * lemmatized using the {@link BioLemmatizer}, using the part-of-speech information if it was
221             * available. Results from the {@link BioLemmatizer} are added to the CAS via the specified
222             * {@link LemmaDecorator} implementation.
223             * 
224             * @see org.apache.uima.analysis_component.JCasAnnotator_ImplBase#process(org.apache.uima.jcas.JCas)
225             */
226            @Override
227            public void process(JCas jCas) throws AnalysisEngineProcessException {
228                    for (Iterator<Annotation> annotIter = jCas.getJFSIndexRepository().getAnnotationIndex().iterator(); annotIter
229                                    .hasNext();) {
230                            Annotation annotation = annotIter.next();
231                            String annotationType = annotationDataExtractor.getAnnotationType(annotation);
232                            if (annotationType != null && annotationType.equals(tokenTypeName)) {
233                                    List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> posTags = getPartOfSpeechTags(annotation);
234                                    if (posTags == null || posTags.isEmpty()) {
235                                            runBioLemmatizer(annotation, null);
236                                    } else {
237                                            for (edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech posTag : posTags) {
238                                                    runBioLemmatizer(annotation, posTag.getPosTag());
239                                            }
240                                    }
241                            }
242                    }
243            }
244    
245            /**
246             * This method uses the {@link BioLemmatizer} to lemmatize the covered text of the input
247             * {@link Annotation}. The lemma is added to the CAS via the {@link LemmaDecorator}
248             * implementation specified in this AE's configuration.
249             * 
250             * @param annotation
251             * @param posTag
252             */
253            private void runBioLemmatizer(Annotation annotation, String posTag) {
254                    String coveredText = annotationDataExtractor.getCoveredText(annotation);
255                    LemmataEntry lemmata = bioLemmatizer.lemmatizeByLexiconAndRules(coveredText, posTag);
256                    Annotation lemmaAnnot = lemmaDecorator.getAnnotationToDecorate(annotation, annotationDataExtractor);
257                    for (edu.ucdenver.ccp.nlp.biolemmatizer.LemmataEntry.Lemma lemma : lemmata.getLemmas()) {
258                            lemmaDecorator.insertLemma(lemmaAnnot,
259                                            new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.Lemma(lemma.getLemma(),
260                                                            new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech(lemma.getPos(),
261                                                                            lemma.getTagSetName())));
262                    }
263            }
264    
265            /**
266             * This method defaults to using the {@link PartOfSpeechDecorator} instance if there is one
267             * initialized. If not available, it will try to use the getPosMethod specified in the
268             * configuration. If neither are available, it is assumed that there is no input part-of-speech
269             * info and null is returned.
270             * 
271             * @param annotation
272             * 
273             * @return the POS tag as extracted from the input {@link Annotation}
274             */
275            private List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> getPartOfSpeechTags(
276                            Annotation annotation) {
277                    if (posDecorator != null) {
278                            return posDecorator.extractPartsOfSpeech(annotation);
279                    }
280                    if (tokenGetPosMethodName != null) {
281                            List<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech> posTagList = new ArrayList<edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech>();
282                            posTagList.add(getPosTagUsingSpecifiedMethodName(annotation));
283                            return posTagList;
284                    }
285                    return null;
286            }
287    
288            /**
289             * If the getPosTag method name is specified (and if no PartOfSpeechDecorator is specified) then
290             * this method is used to call the getPosTag method on the input {@link Annotation}.
291             * 
292             * @param annotation
293             * @return the {@link edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech}
294             *         extracted from the input {@link Annotation} using the specified
295             *         {@link #tokenGetPosMethodName}.
296             */
297            private edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech getPosTagUsingSpecifiedMethodName(
298                            Annotation annotation) {
299                    try {
300                            Method method = annotation.getClass().getDeclaredMethod(tokenGetPosMethodName);
301                            String posTag = method.invoke(annotation).toString();
302                            return new edu.ucdenver.ccp.uima.shims.annotation.syntactic.token.PartOfSpeech(posTag, null);
303                    } catch (NoSuchMethodException e) {
304                            throw new IllegalArgumentException(
305                                            "Error while attempting to retrieve part-of-speech information from class: "
306                                                            + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e);
307                    } catch (IllegalAccessException e) {
308                            throw new IllegalArgumentException(
309                                            "Error while attempting to retrieve part-of-speech information from class: "
310                                                            + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e);
311                    } catch (InvocationTargetException e) {
312                            throw new IllegalArgumentException(
313                                            "Error while attempting to retrieve part-of-speech information from class: "
314                                                            + annotation.getClass().getName() + " using method: " + tokenGetPosMethodName + ".", e);
315                    }
316            }
317    
318            /**
319             * Initializes an {@link AnalysisEngine} that will determine lemmas for tokens that are present
320             * in the {@link CAS}
321             * 
322             * @param tsd
323             * @param tokenClass
324             * @param tokenGetPosMethodName
325             * @param annotationDataExtractorClass
326             * @param lemmaDecoratorClass
327             * @return
328             * @throws ResourceInitializationException
329             * 
330             */
331            public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd,
332                            Class<? extends Annotation> tokenClass, String tokenGetPosMethodName,
333                            Class<? extends AnnotationDataExtractor> annotationDataExtractorClass,
334                            Class<? extends LemmaDecorator> lemmaDecoratorClass) throws ResourceInitializationException {
335                    return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME,
336                                    tokenClass.getName(), PARAM_TOKEN_GET_POS_METHOD_NAME, tokenGetPosMethodName,
337                                    PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS, annotationDataExtractorClass.getName(),
338                                    PARAM_LEMMA_DECORATOR_CLASS, lemmaDecoratorClass.getName());
339            }
340    
341            /**
342             * @param tsd
343             * @param tokenType
344             * @param partOfSpeechDecoratorClass
345             * @param annotationDataExtractorClass
346             * @param lemmaDecoratorClass
347             * @return
348             * @throws ResourceInitializationException
349             */
350            public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd,
351                            String tokenType, Class<? extends PartOfSpeechDecorator> partOfSpeechDecoratorClass,
352                            Class<? extends AnnotationDataExtractor> annotationDataExtractorClass,
353                            Class<? extends LemmaDecorator> lemmaDecoratorClass) throws ResourceInitializationException {
354                    return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME,
355                                    tokenType, PARAM_POS_DECORATOR_CLASS, partOfSpeechDecoratorClass.getName(),
356                                    PARAM_ANNOTATION_DATA_EXTRACTOR_CLASS, annotationDataExtractorClass.getName(),
357                                    PARAM_LEMMA_DECORATOR_CLASS, lemmaDecoratorClass.getName());
358            }
359    
360            /**
361             * @param tsd
362             * @param tokenClass
363             * @return
364             * @throws ResourceInitializationException
365             */
366            public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd,
367                            Class<? extends Annotation> tokenClass) throws ResourceInitializationException {
368                    return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME,
369                                    tokenClass.getName());
370            }
371    
372            /**
373             * @param tsd
374             * @param tokenClass
375             * @param tokenGetPosMethodName
376             * @return
377             * @throws ResourceInitializationException
378             */
379            public static AnalysisEngineDescription createAnalysisEngineDescription(TypeSystemDescription tsd,
380                            Class<? extends Annotation> tokenClass, String tokenGetPosMethodName)
381                            throws ResourceInitializationException {
382                    return AnalysisEngineFactory.createPrimitiveDescription(BioLemmatizer_AE.class, tsd, PARAM_TOKEN_TYPE_NAME,
383                                    tokenClass.getName(), PARAM_TOKEN_GET_POS_METHOD_NAME, tokenGetPosMethodName);
384            }
385    
386    }