001/*
002 * (C) Copyright 2006-2011 Nuxeo SA (http://nuxeo.com/) and contributors.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 * 
016 * Contributors:
017 *     Florent Guillaume
018 */
019
020package org.nuxeo.common.utils;
021
022import java.util.Arrays;
023import java.util.Collections;
024import java.util.HashSet;
025import java.util.LinkedHashSet;
026import java.util.Set;
027import java.util.regex.Pattern;
028
029/**
030 * Functions related to simple fulltext parsing. They don't try to be exhaustive but they work for simple cases.
031 */
public class FullTextUtils {

    /** Word separator: any run of whitespace and/or ASCII punctuation characters. */
    public static final Pattern wordPattern = Pattern.compile("[\\s\\p{Punct}]+");

    /** Words whose raw length is below this number of characters are discarded. */
    public static final int MIN_SIZE = 3;

    /** Space-separated list of the words that are never returned by {@link #parseWord}. */
    public static final String STOP_WORDS = "a an are and as at be by for from how "
            + "i in is it of on or that the this to was what when where who will with "
            + "car donc est il ils je la le les mais ni nous or ou pour tu un une vous " + "www com net org";

    /** The entries of {@link #STOP_WORDS} as a set, for fast lookup. */
    public static final Set<String> stopWords = new HashSet<String>(Arrays.asList(STOP_WORDS.split(" ")));

    /**
     * Replacement table for the lowercase characters U+00E0 to U+00FF: the character at index
     * {@code c - 0xe0} is the unaccented form of {@code c}. Characters that have no unaccented form
     * (U+00F0, U+00F7, U+00FE) map to themselves.
     */
    public static final String UNACCENTED = "aaaaaaaceeeeiiii\u00f0nooooo\u00f7ouuuuy\u00fey";

    private FullTextUtils() {
        // utility class
    }

    /**
     * Extracts the words from a string for simple fulltext indexing.
     * <p>
     * The string is split on {@link #wordPattern} and each piece goes through
     * {@link #parseWord(String, boolean)}. Initial order is kept, but duplicate words are removed.
     * <p>
     * It omits short or stop words, lowercases, optionally removes accents and does pseudo-stemming.
     *
     * @param string the string, may be {@code null}
     * @param removeDiacritics if the diacritics must be removed
     * @return an ordered set of resulting words; an empty immutable set if {@code string} is {@code null}
     */
    public static Set<String> parseFullText(String string, boolean removeDiacritics) {
        if (string == null) {
            return Collections.emptySet();
        }
        Set<String> words = new LinkedHashSet<String>();
        for (String token : wordPattern.split(string)) {
            // splitting may yield an empty leading token; parseWord rejects it as too short
            String word = parseWord(token, removeDiacritics);
            if (word != null) {
                words.add(word);
            }
        }
        return words;
    }

    /**
     * Parses a word and returns a simplified lowercase form.
     * <p>
     * The length check against {@link #MIN_SIZE} is done on the raw word, before any other processing. A
     * trailing {@code 's'} is then dropped from words longer than 3 characters, and the stop word check is done
     * on the final form.
     *
     * @param string the word, may be {@code null}
     * @param removeDiacritics if the diacritics must be removed
     * @return the simplified word, or {@code null} if the input was {@code null} or if it was removed as a stop
     *         word or a short word
     */
    public static String parseWord(String string, boolean removeDiacritics) {
        if (string == null) {
            // null already means "no word" for callers, so treat a missing input the same way
            return null;
        }
        int len = string.length();
        if (len < MIN_SIZE) {
            return null;
        }
        StringBuilder buf = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            char c = Character.toLowerCase(string.charAt(i));
            if (removeDiacritics) {
                appendUnaccented(buf, c);
            } else {
                buf.append(c);
            }
        }
        // simple heuristic to remove plurals
        int l = buf.length();
        if (l > 3 && buf.charAt(l - 1) == 's') {
            buf.setLength(l - 1);
        }
        String word = buf.toString();
        return stopWords.contains(word) ? null : word;
    }

    /** Appends the unaccented form of the (already lowercased) character {@code c} to {@code buf}. */
    private static void appendUnaccented(StringBuilder buf, char c) {
        if (c == '\u00e6') {
            // ae ligature expands to two letters, so it cannot go through the one-to-one table
            buf.append("ae");
        } else if (c >= '\u00e0' && c <= '\u00ff') {
            buf.append(UNACCENTED.charAt(c - 0xe0));
        } else if (c == '\u0153') {
            // oe ligature lies outside the table range
            buf.append("oe");
        } else {
            buf.append(c);
        }
    }
}