/*
 * (C) Copyright 2006-2011 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Florent Guillaume
 */

package org.nuxeo.common.utils;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Functions related to simple fulltext parsing. They don't try to be exhaustive but they work for simple cases.
 */
public class FullTextUtils {

    /** Separator between words: any run of whitespace and/or ASCII punctuation. */
    public static final Pattern wordPattern = Pattern.compile("[\\s\\p{Punct}]+");

    /** Minimum length of a word; shorter words are dropped. */
    public static final int MIN_SIZE = 3;

    /** Space-separated list of the words that are never returned (English, French and a few URL fragments). */
    public static final String STOP_WORDS = "a an are and as at be by for from how "
            + "i in is it of on or that the this to was what when where who will with "
            + "car donc est il ils je la le les mais ni nous or ou pour tu un une vous " + "www com net org";

    /** The entries of {@link #STOP_WORDS} as a set. */
    // STOP_WORDS uses single spaces only, so a plain split on " " yields exactly its entries
    public static final Set<String> stopWords = new HashSet<>(Arrays.asList(STOP_WORDS.split(" ")));

    /**
     * Replacement characters for the lowercase range U+00E0..U+00FF, indexed by {@code c - 0xe0}. Characters that are
     * not accented letters (U+00F0, U+00F7, U+00FE) map to themselves; U+00E6 is handled separately as "ae".
     */
    public static final String UNACCENTED = "aaaaaaaceeeeiiii\u00f0nooooo\u00f7ouuuuy\u00fey";

    private FullTextUtils() {
        // utility class
    }

    /**
     * Extracts the words from a string for simple fulltext indexing.
     * <p>
     * Initial order is kept, but duplicate words are removed.
     * <p>
     * It omits short or stop words, optionally removes accents and does pseudo-stemming.
     *
     * @param string the string, may be {@code null}
     * @param removeDiacritics if the diacritics must be removed
     * @return an ordered set of resulting words, empty if {@code string} is {@code null}
     */
    public static Set<String> parseFullText(String string, boolean removeDiacritics) {
        if (string == null) {
            return Collections.emptySet();
        }
        Set<String> set = new LinkedHashSet<>();
        for (String word : wordPattern.split(string)) {
            String w = parseWord(word, removeDiacritics);
            if (w != null) {
                set.add(w);
            }
        }
        return set;
    }

    /**
     * Parses a word and returns a simplified lowercase form.
     * <p>
     * The word is lowercased, optionally stripped of its diacritics, and a trailing plural {@code s} is removed from
     * words longer than {@link #MIN_SIZE} characters.
     *
     * @param string the word, may be {@code null}
     * @param removeDiacritics if the diacritics must be removed
     * @return the simplified word, or {@code null} if the input was {@code null} or if it was removed as a stop word
     *         or a short word
     */
    public static String parseWord(String string, boolean removeDiacritics) {
        if (string == null) {
            return null;
        }
        int len = string.length();
        if (len < MIN_SIZE) {
            return null;
        }
        StringBuilder sb = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            char c = Character.toLowerCase(string.charAt(i));
            if (!removeDiacritics) {
                sb.append(c);
            } else if (c == '\u00e6') {
                // ae ligature
                sb.append("ae");
            } else if (c >= '\u00e0' && c <= '\u00ff') {
                sb.append(UNACCENTED.charAt(c - 0xe0));
            } else if (c == '\u0153') {
                // oe ligature
                sb.append("oe");
            } else {
                sb.append(c);
            }
        }
        String word = sb.toString();
        // check the stop words before stemming, otherwise stop words ending with an "s"
        // ("this", "mais", "nous", "vous") are truncated first and never recognized
        if (stopWords.contains(word)) {
            return null;
        }
        // simple heuristic to remove plurals
        int l = word.length();
        if (l > MIN_SIZE && word.charAt(l - 1) == 's') {
            word = word.substring(0, l - 1);
            // the singular form may itself be a stop word
            if (stopWords.contains(word)) {
                return null;
            }
        }
        return word;
    }
}