001    // =================================================================================================
002    // Copyright 2011 Twitter, Inc.
003    // -------------------------------------------------------------------------------------------------
004    // Licensed under the Apache License, Version 2.0 (the "License");
005    // you may not use this work except in compliance with the License.
006    // You may obtain a copy of the License in the LICENSE file, or at:
007    //
008    //  http://www.apache.org/licenses/LICENSE-2.0
009    //
010    // Unless required by applicable law or agreed to in writing, software
011    // distributed under the License is distributed on an "AS IS" BASIS,
012    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013    // See the License for the specific language governing permissions and
014    // limitations under the License.
015    // =================================================================================================
016    
017    package com.twitter.common.text.extractor;
018    
019    import java.util.regex.Matcher;
020    import java.util.regex.Pattern;
021    
022    import com.google.common.base.Preconditions;
023    
024    import com.twitter.common.text.token.TokenStream;
025    import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
026    
027    /**
028     * Extracts entities from text according to a given regular expression.
029     */
030    public class RegexExtractor extends TokenStream {
031      private final CharSequenceTermAttribute charSeqTermAtt =
032        addAttribute(CharSequenceTermAttribute.class);
033    
034      private Pattern regexPattern;
035      private int startGroup = 0;
036      private int endGroup = 0;
037      private char triggeringChar = 0;
038      private Matcher matcher = null;
039    
040      /**
041       * Protected constructor for subclass builders, clients should use a builder to create an
042       * instance.
043       */
044      protected RegexExtractor() { }
045    
046      /**
047       * Sets the regular expression used in this {@code RegexExtractor}.
048       *
049       * @param pattern regular expression defining the entities to be extracted
050       */
051      protected void setRegexPattern(Pattern pattern) {
052        this.regexPattern = pattern;
053      }
054    
055      /**
056       * Sets the regular expression and start/end group ID used in this {@code RegexExtractor}.
057       *
058       * @param pattern Regex pattern of a substring to be replaced.
059       * @param startGroup ID of the group in the pattern that matches the beginning
060       *  of the substring being replaced. Set to 0 to match the entire pattern.
061       * @param endGroup ID of the group in the pattern that matches the end
062       *  of the substring being replace. Set to 0 to match the entire pattern.
063       */
064      protected void setRegexPattern(Pattern pattern, int startGroup, int endGroup) {
065        this.regexPattern = pattern;
066        this.startGroup = startGroup;
067        this.endGroup = endGroup;
068      }
069    
070      /**
071       * Sets a character that must appear in the input text. If a specified character does not appear
072       * in the input text, this {@code RegexExtractor} does not extract entities from the text.
073       * Specifying a {@code triggeringChar} may improve the performance by skipping unnecessary pattern
074       * matching.
075       *
076       * @param triggeringChar a character that must appear in the text
077       */
078      protected void setTriggeringChar(char triggeringChar) {
079        Preconditions.checkNotNull(triggeringChar);
080        this.triggeringChar = triggeringChar;
081      }
082    
083      /**
084       * Reset the extractor to use a new {@code CharSequence} as input.
085       *
086       * @param input {@code CharSequence} from which to extract the entities.
087       */
088      public void reset(CharSequence input) {
089        Preconditions.checkNotNull(input);
090        charSeqTermAtt.setTermBuffer(input);
091    
092        if (triggeringChar > 0) {
093          // triggeringChar is specified.
094          boolean foundTriggeringChar = false;
095          for (int i = 0; i < input.length(); i++) {
096            if (triggeringChar == input.charAt(i)) {
097              foundTriggeringChar = true;
098              break;
099            }
100          }
101          if (!foundTriggeringChar) {
102            // No triggering char found. No extraction performed.
103            matcher = null;
104            return;
105          }
106        }
107    
108        if (regexPattern != null) {
109          matcher = regexPattern.matcher(input);
110        }
111      }
112    
113      @Override
114      public boolean incrementToken() {
115        if (matcher != null && matcher.find()) {
116          int start = matcher.start(startGroup);
117          int end = matcher.end(endGroup);
118    
119          clearAttributes();
120          charSeqTermAtt.setOffset(start);
121          charSeqTermAtt.setLength(end - start);
122    
123          return true;
124        } else {
125          return false;
126        }
127      }
128    
129      public static class Builder extends AbstractBuilder<RegexExtractor, Builder> {
130        public Builder() {
131          super(new RegexExtractor());
132        }
133      }
134    
135      public abstract static class
136          AbstractBuilder<N extends RegexExtractor, T extends AbstractBuilder<N, T>> {
137        private final N extractor;
138    
139        protected AbstractBuilder(N transformer) {
140          this.extractor = Preconditions.checkNotNull(transformer);
141        }
142    
143        @SuppressWarnings("unchecked")
144        protected T self() {
145          return (T) this;
146        }
147    
148        public T setRegexPattern(Pattern pattern) {
149          Preconditions.checkNotNull(pattern);
150          extractor.regexPattern = pattern;
151          return self();
152        }
153    
154        public T setRegexPattern(Pattern pattern, int startGroup, int endGroup) {
155          Preconditions.checkNotNull(pattern);
156          Preconditions.checkArgument(startGroup >= 0);
157          Preconditions.checkArgument(endGroup >= 0);
158          extractor.setRegexPattern(pattern, startGroup, endGroup);
159          return self();
160        }
161    
162        public T setTriggeringChar(char triggeringChar) {
163          Preconditions.checkArgument(triggeringChar > 0);
164          extractor.setTriggeringChar(triggeringChar);
165          return self();
166        }
167    
168        public N build() {
169          return extractor;
170        }
171      }
172    }