001/* 002 * (C) Copyright 2006-2014 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Florent Guillaume 018 */ 019package org.nuxeo.ecm.core.storage; 020 021import java.util.Arrays; 022import java.util.Collections; 023import java.util.Iterator; 024import java.util.LinkedList; 025import java.util.List; 026import java.util.Set; 027import java.util.regex.Pattern; 028 029import org.nuxeo.ecm.core.query.QueryParseException; 030 031/** 032 * Structured fulltext query analyzer. 033 */ 034public class FulltextQueryAnalyzer { 035 036 protected static final String SPACE = " "; 037 038 protected static final String PLUS = "+"; 039 040 protected static final String MINUS = "-"; 041 042 protected static final char CSPACE = ' '; 043 044 protected static final String DOUBLE_QUOTES = "\""; 045 046 protected static final String OR = "OR"; 047 048 protected static final Pattern SEPARATOR = Pattern.compile("[ ]"); 049 050 protected static final Pattern IGNORED = Pattern.compile("\\p{Punct}+"); 051 052 /** 053 * Structured fulltext query operator. 054 */ 055 public enum Op { 056 OR, AND, WORD, NOTWORD 057 } 058 059 /** 060 * Structured fulltext query. 061 */ 062 public static class FulltextQuery { 063 064 public Op op; 065 066 /** The list of terms, if op is OR or AND */ 067 public List<FulltextQuery> terms; 068 069 /** The word, if op is WORD or NOTWORD */ 070 public String word; 071 072 /** 073 * Checks if the word is a phrase. 074 */ 075 public boolean isPhrase() { 076 return word != null && word.contains(SPACE); 077 } 078 } 079 080 protected FulltextQuery ft = new FulltextQuery(); 081 082 protected List<FulltextQuery> terms = new LinkedList<FulltextQuery>(); 083 084 protected FulltextQuery analyze(String query) { 085 query = query.replaceAll(" +", " ").trim(); 086 if (query.trim().length() == 0) { 087 return null; 088 } 089 ft.op = Op.OR; 090 ft.terms = new LinkedList<FulltextQuery>(); 091 // current sequence of ANDed terms 092 boolean wasOr = false; 093 String[] words = split(query); 094 for (Iterator<String> it = Arrays.asList(words).iterator(); it.hasNext();) { 095 boolean plus = false; 096 boolean minus = false; 097 String word = it.next(); 098 if (ignored(word)) { 099 continue; 100 } 101 if (word.startsWith(PLUS)) { 102 plus = true; 103 word = word.substring(1); 104 } else if (word.startsWith(MINUS)) { 105 minus = true; 106 word = word.substring(1); 107 } 108 if (word.startsWith(DOUBLE_QUOTES)) { 109 // read phrase 110 word = word.substring(1); 111 StringBuilder phrase = null; 112 while (true) { 113 boolean end = word.endsWith(DOUBLE_QUOTES); 114 if (end) { 115 word = word.substring(0, word.length() - 1).trim(); 116 } 117 if (word.contains(DOUBLE_QUOTES)) { 118 throw new QueryParseException("Invalid fulltext query (double quotes in word): " + query); 119 } 120 if (word.length() != 0) { 121 if (phrase == null) { 122 phrase = new StringBuilder(); 123 } else { 124 phrase.append(CSPACE); 125 } 126 phrase.append(word); 127 } 128 if (end) { 129 break; 130 } 131 if (!it.hasNext()) { 132 throw new QueryParseException("Invalid fulltext query (unterminated phrase): " + query); 133 } 134 word = it.next(); 135 } 136 if (phrase == null) { 137 continue; 138 } 139 word = phrase.toString(); 140 } else if (word.equalsIgnoreCase(OR)) { 141 if (wasOr) { 142 throw new QueryParseException("Invalid fulltext query (OR OR): " + query); 143 } 144 if (terms.isEmpty()) { 145 throw new QueryParseException("Invalid fulltext query (standalone OR): " + query); 146 } 147 wasOr = true; 148 continue; 149 } 150 FulltextQuery w = new FulltextQuery(); 151 if (minus) { 152 if (word.length() == 0) { 153 throw new QueryParseException("Invalid fulltext query (standalone -): " + query); 154 } 155 w.op = Op.NOTWORD; 156 } else { 157 if (plus) { 158 if (word.length() == 0) { 159 throw new QueryParseException("Invalid fulltext query (standalone +): " + query); 160 } 161 } 162 w.op = Op.WORD; 163 } 164 if (wasOr) { 165 endAnd(); 166 wasOr = false; 167 } 168 w.word = word; 169 terms.add(w); 170 } 171 if (wasOr) { 172 throw new QueryParseException("Invalid fulltext query (final OR): " + query); 173 } 174 // final terms 175 endAnd(); 176 int size = ft.terms.size(); 177 if (size == 0) { 178 // all terms were negative 179 return null; 180 } else if (size == 1) { 181 // simplify when no OR 182 ft = ft.terms.get(0); 183 } 184 return ft; 185 } 186 187 protected String[] split(String query) { 188 return SEPARATOR.split(query); 189 } 190 191 protected boolean ignored(String word) { 192 if ("-".equals(word) || "+".equals(word) || word.contains("\"")) { 193 return false; // dealt with later, different error 194 } 195 return IGNORED.matcher(word).matches(); 196 } 197 198 // add current ANDed terms to global OR 199 protected void endAnd() { 200 // put negative words at the end 201 List<FulltextQuery> pos = new LinkedList<FulltextQuery>(); 202 List<FulltextQuery> neg = new LinkedList<FulltextQuery>(); 203 for (FulltextQuery term : terms) { 204 if (term.op == Op.NOTWORD) { 205 neg.add(term); 206 } else { 207 pos.add(term); 208 } 209 } 210 if (!pos.isEmpty()) { 211 terms = pos; 212 terms.addAll(neg); 213 if (terms.size() == 1) { 214 ft.terms.add(terms.get(0)); 215 } else { 216 FulltextQuery a = new FulltextQuery(); 217 a.op = Op.AND; 218 a.terms = terms; 219 ft.terms.add(a); 220 } 221 } 222 terms = new LinkedList<FulltextQuery>(); 223 } 224 225 public static void translate(FulltextQuery ft, StringBuilder buf, String or, String and, String andNot, 226 String wordStart, String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd, 227 boolean quotePhraseWords) { 228 if (ft.op == Op.AND || ft.op == Op.OR) { 229 buf.append('('); 230 for (int i = 0; i < ft.terms.size(); i++) { 231 FulltextQuery term = ft.terms.get(i); 232 if (i > 0) { 233 buf.append(' '); 234 if (ft.op == Op.OR) { 235 buf.append(or); 236 } else { // Op.AND 237 if (term.op == Op.NOTWORD) { 238 buf.append(andNot); 239 } else { 240 buf.append(and); 241 } 242 } 243 buf.append(' '); 244 } 245 translate(term, buf, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd, 246 quotePhraseWords); 247 } 248 buf.append(')'); 249 return; 250 } else { 251 String word = ft.word; 252 if (ft.isPhrase()) { 253 if (quotePhraseWords) { 254 boolean first = true; 255 for (String w : word.split(" ")) { 256 if (!first) { 257 buf.append(" "); 258 } 259 first = false; 260 appendWord(w, buf, wordStart, wordEnd, wordCharsReserved); 261 } 262 } else { 263 buf.append(phraseStart); 264 buf.append(word); 265 buf.append(phraseEnd); 266 } 267 } else { 268 appendWord(word, buf, wordStart, wordEnd, wordCharsReserved); 269 } 270 } 271 } 272 273 protected static void appendWord(String word, StringBuilder buf, String start, String end, Set<Character> reserved) { 274 boolean quote = true; 275 if (!reserved.isEmpty()) { 276 for (char c : word.toCharArray()) { 277 if (reserved.contains(Character.valueOf(c))) { 278 quote = false; 279 break; 280 } 281 } 282 } 283 if (quote) { 284 buf.append(start); 285 } 286 buf.append(word); 287 if (quote) { 288 buf.append(end); 289 } 290 } 291 292 public static boolean hasPhrase(FulltextQuery ft) { 293 if (ft.op == Op.AND || ft.op == Op.OR) { 294 for (FulltextQuery term : ft.terms) { 295 if (hasPhrase(term)) { 296 return true; 297 } 298 } 299 return false; 300 } else { 301 return ft.isPhrase(); 302 } 303 } 304 305 /** 306 * Analyzes a fulltext query into a generic datastructure that can be used for each specific database. 307 * <p> 308 * List of terms containing only negative words are suppressed. Otherwise negative words are put at the end of the 309 * lists of terms. 310 */ 311 public static FulltextQuery analyzeFulltextQuery(String query) { 312 return new FulltextQueryAnalyzer().analyze(query); 313 } 314 315 /** 316 * Translate fulltext into a common pattern used by many servers. 317 */ 318 public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String phraseQuote) { 319 StringBuilder buf = new StringBuilder(); 320 translate(ft, buf, or, and, andNot, "", "", Collections.<Character> emptySet(), phraseQuote, phraseQuote, false); 321 return buf.toString(); 322 } 323 324 /** 325 * Translate fulltext into a common pattern used by many servers. 326 */ 327 public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String wordStart, 328 String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd, 329 boolean quotePhraseWords) { 330 StringBuilder buf = new StringBuilder(); 331 translate(ft, buf, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd, 332 quotePhraseWords); 333 return buf.toString(); 334 } 335 336}