/*
 * (C) Copyright 2006-2011 Nuxeo SA (http://nuxeo.com/) and contributors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Florent Guillaume
 */

package org.nuxeo.common.utils;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Functions related to simple fulltext parsing. They don't try to be exhaustive but they work for simple cases.
 */
public class FullTextUtils {

    /** Separator between words: any run of whitespace and/or punctuation characters. */
    public static final Pattern wordPattern = Pattern.compile("[\\s\\p{Punct}]+");

    /** Minimum length of a word; shorter words are dropped. */
    public static final int MIN_SIZE = 3;

    /** Space-separated list of stop words (English words, French words and common host name parts). */
    public static final String STOP_WORDS = "a an are and as at be by for from how "
            + "i in is it of on or that the this to was what when where who will with "
            + "car donc est il ils je la le les mais ni nous or ou pour tu un une vous " + "www com net org";

    /** The stop words of {@link #STOP_WORDS} as a set, for fast lookup. */
    public static final Set<String> stopWords = new HashSet<String>(Arrays.asList(STOP_WORDS.split(" ")));

    /**
     * Replacement characters for the code points U+00E0 to U+00FF, indexed by {@code c - 0xe0}.
     * <p>
     * Characters without an unaccented equivalent (U+00F0, U+00F7, U+00FE) map to themselves. The entry for U+00E6
     * is never read because that character is expanded to {@code "ae"} separately.
     */
    public static final String UNACCENTED = "aaaaaaaceeeeiiii\u00f0nooooo\u00f7ouuuuy\u00fey";

    private FullTextUtils() {
        // utility class
    }

    /**
     * Extracts the words from a string for simple fulltext indexing.
     * <p>
     * Initial order is kept, but duplicate words are removed.
     * <p>
     * It omits short or stop words, optionally removes accents and does pseudo-stemming.
     *
     * @param string the string, may be {@code null}
     * @param removeDiacritics if the diacritics must be removed
     * @return an ordered set of resulting words, empty if {@code string} is {@code null}
     */
    public static Set<String> parseFullText(String string, boolean removeDiacritics) {
        if (string == null) {
            return Collections.emptySet();
        }
        Set<String> set = new LinkedHashSet<String>();
        for (String word : wordPattern.split(string)) {
            String w = parseWord(word, removeDiacritics);
            if (w != null) {
                set.add(w);
            }
        }
        return set;
    }

    /**
     * Parses a word and returns a simplified lowercase form.
     * <p>
     * The word is lowercased, optionally stripped of its diacritics, and a trailing {@code 's'} is removed from
     * words longer than {@link #MIN_SIZE} as a crude plural heuristic. The stop word list is consulted both before
     * and after the plural heuristic, so that stop words which themselves end in {@code 's'} (like "this") are
     * removed as well.
     *
     * @param string the word, may be {@code null}
     * @param removeDiacritics if the diacritics must be removed
     * @return the simplified word, or {@code null} if the input was {@code null} or if it was removed as a stop word
     *         or a short word
     */
    public static String parseWord(String string, boolean removeDiacritics) {
        if (string == null) {
            return null;
        }
        int len = string.length();
        if (len < MIN_SIZE) {
            return null;
        }
        StringBuilder buf = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            char c = Character.toLowerCase(string.charAt(i));
            if (removeDiacritics) {
                if (c == '\u00e6') {
                    // ligature ae
                    buf.append("ae");
                } else if (c >= '\u00e0' && c <= '\u00ff') {
                    buf.append(UNACCENTED.charAt(c - 0xe0));
                } else if (c == '\u0153') {
                    // ligature oe
                    buf.append("oe");
                } else {
                    buf.append(c);
                }
            } else {
                buf.append(c);
            }
        }
        String word = buf.toString();
        // check the unstemmed form first, otherwise stop words ending in 's' would never match
        if (stopWords.contains(word)) {
            return null;
        }
        // simple heuristic to remove plurals
        int l = word.length();
        if (l > MIN_SIZE && word.charAt(l - 1) == 's') {
            word = word.substring(0, l - 1);
            // the singular form may itself be a stop word ("ares" -> "are")
            if (stopWords.contains(word)) {
                return null;
            }
        }
        return word;
    }
}