001/*
002 * Copyright (c) 2006-2014 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the Eclipse Public License v1.0
006 * which accompanies this distribution, and is available at
007 * http://www.eclipse.org/legal/epl-v10.html
008 *
009 * Contributors:
010 *     Florent Guillaume
011 */
012package org.nuxeo.ecm.core.storage;
013
014import java.util.Arrays;
015import java.util.Collections;
016import java.util.Iterator;
017import java.util.LinkedList;
018import java.util.List;
019import java.util.Set;
020import java.util.regex.Pattern;
021
022import org.nuxeo.ecm.core.query.QueryParseException;
023
024/**
025 * Structured fulltext query analyzer.
026 */
027public class FulltextQueryAnalyzer {
028
029    protected static final String SPACE = " ";
030
031    protected static final String PLUS = "+";
032
033    protected static final String MINUS = "-";
034
035    protected static final char CSPACE = ' ';
036
037    protected static final String DOUBLE_QUOTES = "\"";
038
039    protected static final String OR = "OR";
040
041    protected static final Pattern SEPARATOR = Pattern.compile("[ ]");
042
043    protected static final Pattern IGNORED = Pattern.compile("\\p{Punct}+");
044
045    /**
046     * Structured fulltext query operator.
047     */
048    public enum Op {
049        OR, AND, WORD, NOTWORD
050    }
051
052    /**
053     * Structured fulltext query.
054     */
055    public static class FulltextQuery {
056
057        public Op op;
058
059        /** The list of terms, if op is OR or AND */
060        public List<FulltextQuery> terms;
061
062        /** The word, if op is WORD or NOTWORD */
063        public String word;
064
065        /**
066         * Checks if the word is a phrase.
067         */
068        public boolean isPhrase() {
069            return word != null && word.contains(SPACE);
070        }
071    }
072
073    protected FulltextQuery ft = new FulltextQuery();
074
075    protected List<FulltextQuery> terms = new LinkedList<FulltextQuery>();
076
077    protected FulltextQuery analyze(String query) {
078        query = query.replaceAll(" +", " ").trim();
079        if (query.trim().length() == 0) {
080            return null;
081        }
082        ft.op = Op.OR;
083        ft.terms = new LinkedList<FulltextQuery>();
084        // current sequence of ANDed terms
085        boolean wasOr = false;
086        String[] words = split(query);
087        for (Iterator<String> it = Arrays.asList(words).iterator(); it.hasNext();) {
088            boolean plus = false;
089            boolean minus = false;
090            String word = it.next();
091            if (ignored(word)) {
092                continue;
093            }
094            if (word.startsWith(PLUS)) {
095                plus = true;
096                word = word.substring(1);
097            } else if (word.startsWith(MINUS)) {
098                minus = true;
099                word = word.substring(1);
100            }
101            if (word.startsWith(DOUBLE_QUOTES)) {
102                // read phrase
103                word = word.substring(1);
104                StringBuilder phrase = null;
105                while (true) {
106                    boolean end = word.endsWith(DOUBLE_QUOTES);
107                    if (end) {
108                        word = word.substring(0, word.length() - 1).trim();
109                    }
110                    if (word.contains(DOUBLE_QUOTES)) {
111                        throw new QueryParseException("Invalid fulltext query (double quotes in word): " + query);
112                    }
113                    if (word.length() != 0) {
114                        if (phrase == null) {
115                            phrase = new StringBuilder();
116                        } else {
117                            phrase.append(CSPACE);
118                        }
119                        phrase.append(word);
120                    }
121                    if (end) {
122                        break;
123                    }
124                    if (!it.hasNext()) {
125                        throw new QueryParseException("Invalid fulltext query (unterminated phrase): " + query);
126                    }
127                    word = it.next();
128                }
129                if (phrase == null) {
130                    continue;
131                }
132                word = phrase.toString();
133            } else if (word.equalsIgnoreCase(OR)) {
134                if (wasOr) {
135                    throw new QueryParseException("Invalid fulltext query (OR OR): " + query);
136                }
137                if (terms.isEmpty()) {
138                    throw new QueryParseException("Invalid fulltext query (standalone OR): " + query);
139                }
140                wasOr = true;
141                continue;
142            }
143            FulltextQuery w = new FulltextQuery();
144            if (minus) {
145                if (word.length() == 0) {
146                    throw new QueryParseException("Invalid fulltext query (standalone -): " + query);
147                }
148                w.op = Op.NOTWORD;
149            } else {
150                if (plus) {
151                    if (word.length() == 0) {
152                        throw new QueryParseException("Invalid fulltext query (standalone +): " + query);
153                    }
154                }
155                w.op = Op.WORD;
156            }
157            if (wasOr) {
158                endAnd();
159                wasOr = false;
160            }
161            w.word = word;
162            terms.add(w);
163        }
164        if (wasOr) {
165            throw new QueryParseException("Invalid fulltext query (final OR): " + query);
166        }
167        // final terms
168        endAnd();
169        int size = ft.terms.size();
170        if (size == 0) {
171            // all terms were negative
172            return null;
173        } else if (size == 1) {
174            // simplify when no OR
175            ft = ft.terms.get(0);
176        }
177        return ft;
178    }
179
180    protected String[] split(String query) {
181        return SEPARATOR.split(query);
182    }
183
184    protected boolean ignored(String word) {
185        if ("-".equals(word) || "+".equals(word) || word.contains("\"")) {
186            return false; // dealt with later, different error
187        }
188        return IGNORED.matcher(word).matches();
189    }
190
191    // add current ANDed terms to global OR
192    protected void endAnd() {
193        // put negative words at the end
194        List<FulltextQuery> pos = new LinkedList<FulltextQuery>();
195        List<FulltextQuery> neg = new LinkedList<FulltextQuery>();
196        for (FulltextQuery term : terms) {
197            if (term.op == Op.NOTWORD) {
198                neg.add(term);
199            } else {
200                pos.add(term);
201            }
202        }
203        if (!pos.isEmpty()) {
204            terms = pos;
205            terms.addAll(neg);
206            if (terms.size() == 1) {
207                ft.terms.add(terms.get(0));
208            } else {
209                FulltextQuery a = new FulltextQuery();
210                a.op = Op.AND;
211                a.terms = terms;
212                ft.terms.add(a);
213            }
214        }
215        terms = new LinkedList<FulltextQuery>();
216    }
217
218    public static void translate(FulltextQuery ft, StringBuilder buf, String or, String and, String andNot,
219            String wordStart, String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd,
220            boolean quotePhraseWords) {
221        if (ft.op == Op.AND || ft.op == Op.OR) {
222            buf.append('(');
223            for (int i = 0; i < ft.terms.size(); i++) {
224                FulltextQuery term = ft.terms.get(i);
225                if (i > 0) {
226                    buf.append(' ');
227                    if (ft.op == Op.OR) {
228                        buf.append(or);
229                    } else { // Op.AND
230                        if (term.op == Op.NOTWORD) {
231                            buf.append(andNot);
232                        } else {
233                            buf.append(and);
234                        }
235                    }
236                    buf.append(' ');
237                }
238                translate(term, buf, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd,
239                        quotePhraseWords);
240            }
241            buf.append(')');
242            return;
243        } else {
244            String word = ft.word;
245            if (ft.isPhrase()) {
246                if (quotePhraseWords) {
247                    boolean first = true;
248                    for (String w : word.split(" ")) {
249                        if (!first) {
250                            buf.append(" ");
251                        }
252                        first = false;
253                        appendWord(w, buf, wordStart, wordEnd, wordCharsReserved);
254                    }
255                } else {
256                    buf.append(phraseStart);
257                    buf.append(word);
258                    buf.append(phraseEnd);
259                }
260            } else {
261                appendWord(word, buf, wordStart, wordEnd, wordCharsReserved);
262            }
263        }
264    }
265
266    protected static void appendWord(String word, StringBuilder buf, String start, String end, Set<Character> reserved) {
267        boolean quote = true;
268        if (!reserved.isEmpty()) {
269            for (char c : word.toCharArray()) {
270                if (reserved.contains(Character.valueOf(c))) {
271                    quote = false;
272                    break;
273                }
274            }
275        }
276        if (quote) {
277            buf.append(start);
278        }
279        buf.append(word);
280        if (quote) {
281            buf.append(end);
282        }
283    }
284
285    public static boolean hasPhrase(FulltextQuery ft) {
286        if (ft.op == Op.AND || ft.op == Op.OR) {
287            for (FulltextQuery term : ft.terms) {
288                if (hasPhrase(term)) {
289                    return true;
290                }
291            }
292            return false;
293        } else {
294            return ft.isPhrase();
295        }
296    }
297
298    /**
299     * Analyzes a fulltext query into a generic datastructure that can be used for each specific database.
300     * <p>
301     * List of terms containing only negative words are suppressed. Otherwise negative words are put at the end of the
302     * lists of terms.
303     */
304    public static FulltextQuery analyzeFulltextQuery(String query) {
305        return new FulltextQueryAnalyzer().analyze(query);
306    }
307
308    /**
309     * Translate fulltext into a common pattern used by many servers.
310     */
311    public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String phraseQuote) {
312        StringBuilder buf = new StringBuilder();
313        translate(ft, buf, or, and, andNot, "", "", Collections.<Character> emptySet(), phraseQuote, phraseQuote, false);
314        return buf.toString();
315    }
316
317    /**
318     * Translate fulltext into a common pattern used by many servers.
319     */
320    public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String wordStart,
321            String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd,
322            boolean quotePhraseWords) {
323        StringBuilder buf = new StringBuilder();
324        translate(ft, buf, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd,
325                quotePhraseWords);
326        return buf.toString();
327    }
328
329}