001/* 002 * Copyright (c) 2006-2014 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the Eclipse Public License v1.0 006 * which accompanies this distribution, and is available at 007 * http://www.eclipse.org/legal/epl-v10.html 008 * 009 * Contributors: 010 * Florent Guillaume 011 */ 012package org.nuxeo.ecm.core.storage; 013 014import java.util.Arrays; 015import java.util.Collections; 016import java.util.Iterator; 017import java.util.LinkedList; 018import java.util.List; 019import java.util.Set; 020import java.util.regex.Pattern; 021 022import org.nuxeo.ecm.core.query.QueryParseException; 023 024/** 025 * Structured fulltext query analyzer. 026 */ 027public class FulltextQueryAnalyzer { 028 029 protected static final String SPACE = " "; 030 031 protected static final String PLUS = "+"; 032 033 protected static final String MINUS = "-"; 034 035 protected static final char CSPACE = ' '; 036 037 protected static final String DOUBLE_QUOTES = "\""; 038 039 protected static final String OR = "OR"; 040 041 protected static final Pattern SEPARATOR = Pattern.compile("[ ]"); 042 043 protected static final Pattern IGNORED = Pattern.compile("\\p{Punct}+"); 044 045 /** 046 * Structured fulltext query operator. 047 */ 048 public enum Op { 049 OR, AND, WORD, NOTWORD 050 } 051 052 /** 053 * Structured fulltext query. 054 */ 055 public static class FulltextQuery { 056 057 public Op op; 058 059 /** The list of terms, if op is OR or AND */ 060 public List<FulltextQuery> terms; 061 062 /** The word, if op is WORD or NOTWORD */ 063 public String word; 064 065 /** 066 * Checks if the word is a phrase. 067 */ 068 public boolean isPhrase() { 069 return word != null && word.contains(SPACE); 070 } 071 } 072 073 protected FulltextQuery ft = new FulltextQuery(); 074 075 protected List<FulltextQuery> terms = new LinkedList<FulltextQuery>(); 076 077 protected FulltextQuery analyze(String query) { 078 query = query.replaceAll(" +", " ").trim(); 079 if (query.trim().length() == 0) { 080 return null; 081 } 082 ft.op = Op.OR; 083 ft.terms = new LinkedList<FulltextQuery>(); 084 // current sequence of ANDed terms 085 boolean wasOr = false; 086 String[] words = split(query); 087 for (Iterator<String> it = Arrays.asList(words).iterator(); it.hasNext();) { 088 boolean plus = false; 089 boolean minus = false; 090 String word = it.next(); 091 if (ignored(word)) { 092 continue; 093 } 094 if (word.startsWith(PLUS)) { 095 plus = true; 096 word = word.substring(1); 097 } else if (word.startsWith(MINUS)) { 098 minus = true; 099 word = word.substring(1); 100 } 101 if (word.startsWith(DOUBLE_QUOTES)) { 102 // read phrase 103 word = word.substring(1); 104 StringBuilder phrase = null; 105 while (true) { 106 boolean end = word.endsWith(DOUBLE_QUOTES); 107 if (end) { 108 word = word.substring(0, word.length() - 1).trim(); 109 } 110 if (word.contains(DOUBLE_QUOTES)) { 111 throw new QueryParseException("Invalid fulltext query (double quotes in word): " + query); 112 } 113 if (word.length() != 0) { 114 if (phrase == null) { 115 phrase = new StringBuilder(); 116 } else { 117 phrase.append(CSPACE); 118 } 119 phrase.append(word); 120 } 121 if (end) { 122 break; 123 } 124 if (!it.hasNext()) { 125 throw new QueryParseException("Invalid fulltext query (unterminated phrase): " + query); 126 } 127 word = it.next(); 128 } 129 if (phrase == null) { 130 continue; 131 } 132 word = phrase.toString(); 133 } else if (word.equalsIgnoreCase(OR)) { 134 if (wasOr) { 135 throw new QueryParseException("Invalid fulltext query (OR OR): " + query); 136 } 137 if (terms.isEmpty()) { 138 throw new QueryParseException("Invalid fulltext query (standalone OR): " + query); 139 } 140 wasOr = true; 141 continue; 142 } 143 FulltextQuery w = new FulltextQuery(); 144 if (minus) { 145 if (word.length() == 0) { 146 throw new QueryParseException("Invalid fulltext query (standalone -): " + query); 147 } 148 w.op = Op.NOTWORD; 149 } else { 150 if (plus) { 151 if (word.length() == 0) { 152 throw new QueryParseException("Invalid fulltext query (standalone +): " + query); 153 } 154 } 155 w.op = Op.WORD; 156 } 157 if (wasOr) { 158 endAnd(); 159 wasOr = false; 160 } 161 w.word = word; 162 terms.add(w); 163 } 164 if (wasOr) { 165 throw new QueryParseException("Invalid fulltext query (final OR): " + query); 166 } 167 // final terms 168 endAnd(); 169 int size = ft.terms.size(); 170 if (size == 0) { 171 // all terms were negative 172 return null; 173 } else if (size == 1) { 174 // simplify when no OR 175 ft = ft.terms.get(0); 176 } 177 return ft; 178 } 179 180 protected String[] split(String query) { 181 return SEPARATOR.split(query); 182 } 183 184 protected boolean ignored(String word) { 185 if ("-".equals(word) || "+".equals(word) || word.contains("\"")) { 186 return false; // dealt with later, different error 187 } 188 return IGNORED.matcher(word).matches(); 189 } 190 191 // add current ANDed terms to global OR 192 protected void endAnd() { 193 // put negative words at the end 194 List<FulltextQuery> pos = new LinkedList<FulltextQuery>(); 195 List<FulltextQuery> neg = new LinkedList<FulltextQuery>(); 196 for (FulltextQuery term : terms) { 197 if (term.op == Op.NOTWORD) { 198 neg.add(term); 199 } else { 200 pos.add(term); 201 } 202 } 203 if (!pos.isEmpty()) { 204 terms = pos; 205 terms.addAll(neg); 206 if (terms.size() == 1) { 207 ft.terms.add(terms.get(0)); 208 } else { 209 FulltextQuery a = new FulltextQuery(); 210 a.op = Op.AND; 211 a.terms = terms; 212 ft.terms.add(a); 213 } 214 } 215 terms = new LinkedList<FulltextQuery>(); 216 } 217 218 public static void translate(FulltextQuery ft, StringBuilder buf, String or, String and, String andNot, 219 String wordStart, String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd, 220 boolean quotePhraseWords) { 221 if (ft.op == Op.AND || ft.op == Op.OR) { 222 buf.append('('); 223 for (int i = 0; i < ft.terms.size(); i++) { 224 FulltextQuery term = ft.terms.get(i); 225 if (i > 0) { 226 buf.append(' '); 227 if (ft.op == Op.OR) { 228 buf.append(or); 229 } else { // Op.AND 230 if (term.op == Op.NOTWORD) { 231 buf.append(andNot); 232 } else { 233 buf.append(and); 234 } 235 } 236 buf.append(' '); 237 } 238 translate(term, buf, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd, 239 quotePhraseWords); 240 } 241 buf.append(')'); 242 return; 243 } else { 244 String word = ft.word; 245 if (ft.isPhrase()) { 246 if (quotePhraseWords) { 247 boolean first = true; 248 for (String w : word.split(" ")) { 249 if (!first) { 250 buf.append(" "); 251 } 252 first = false; 253 appendWord(w, buf, wordStart, wordEnd, wordCharsReserved); 254 } 255 } else { 256 buf.append(phraseStart); 257 buf.append(word); 258 buf.append(phraseEnd); 259 } 260 } else { 261 appendWord(word, buf, wordStart, wordEnd, wordCharsReserved); 262 } 263 } 264 } 265 266 protected static void appendWord(String word, StringBuilder buf, String start, String end, Set<Character> reserved) { 267 boolean quote = true; 268 if (!reserved.isEmpty()) { 269 for (char c : word.toCharArray()) { 270 if (reserved.contains(Character.valueOf(c))) { 271 quote = false; 272 break; 273 } 274 } 275 } 276 if (quote) { 277 buf.append(start); 278 } 279 buf.append(word); 280 if (quote) { 281 buf.append(end); 282 } 283 } 284 285 public static boolean hasPhrase(FulltextQuery ft) { 286 if (ft.op == Op.AND || ft.op == Op.OR) { 287 for (FulltextQuery term : ft.terms) { 288 if (hasPhrase(term)) { 289 return true; 290 } 291 } 292 return false; 293 } else { 294 return ft.isPhrase(); 295 } 296 } 297 298 /** 299 * Analyzes a fulltext query into a generic datastructure that can be used for each specific database. 300 * <p> 301 * List of terms containing only negative words are suppressed. Otherwise negative words are put at the end of the 302 * lists of terms. 303 */ 304 public static FulltextQuery analyzeFulltextQuery(String query) { 305 return new FulltextQueryAnalyzer().analyze(query); 306 } 307 308 /** 309 * Translate fulltext into a common pattern used by many servers. 310 */ 311 public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String phraseQuote) { 312 StringBuilder buf = new StringBuilder(); 313 translate(ft, buf, or, and, andNot, "", "", Collections.<Character> emptySet(), phraseQuote, phraseQuote, false); 314 return buf.toString(); 315 } 316 317 /** 318 * Translate fulltext into a common pattern used by many servers. 319 */ 320 public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String wordStart, 321 String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd, 322 boolean quotePhraseWords) { 323 StringBuilder buf = new StringBuilder(); 324 translate(ft, buf, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd, 325 quotePhraseWords); 326 return buf.toString(); 327 } 328 329}