001/*
002 * (C) Copyright 2006-2014 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 */
019package org.nuxeo.ecm.core.storage;
020
021import java.util.Arrays;
022import java.util.Collections;
023import java.util.Iterator;
024import java.util.LinkedList;
025import java.util.List;
026import java.util.Set;
027import java.util.regex.Pattern;
028
029import org.nuxeo.ecm.core.query.QueryParseException;
030
031/**
032 * Structured fulltext query analyzer.
033 */
034public class FulltextQueryAnalyzer {
035
036    protected static final String SPACE = " ";
037
038    protected static final String PLUS = "+";
039
040    protected static final String MINUS = "-";
041
042    protected static final char CSPACE = ' ';
043
044    protected static final String DOUBLE_QUOTES = "\"";
045
046    protected static final String OR = "OR";
047
048    protected static final Pattern SEPARATOR = Pattern.compile("[ ]");
049
050    protected static final Pattern IGNORED = Pattern.compile("\\p{Punct}+");
051
052    /**
053     * Structured fulltext query operator.
054     */
055    public enum Op {
056        OR, AND, WORD, NOTWORD
057    }
058
059    /**
060     * Structured fulltext query.
061     */
062    public static class FulltextQuery {
063
064        public Op op;
065
066        /** The list of terms, if op is OR or AND */
067        public List<FulltextQuery> terms;
068
069        /** The word, if op is WORD or NOTWORD */
070        public String word;
071
072        /**
073         * Checks if the word is a phrase.
074         */
075        public boolean isPhrase() {
076            return word != null && word.contains(SPACE);
077        }
078    }
079
080    protected FulltextQuery ft = new FulltextQuery();
081
082    protected List<FulltextQuery> terms = new LinkedList<>();
083
084    protected FulltextQuery analyze(String query) {
085        query = query.replaceAll(" +", " ").trim();
086        if (query.trim().length() == 0) {
087            return null;
088        }
089        ft.op = Op.OR;
090        ft.terms = new LinkedList<>();
091        // current sequence of ANDed terms
092        boolean wasOr = false;
093        String[] words = split(query);
094        for (Iterator<String> it = Arrays.asList(words).iterator(); it.hasNext();) {
095            boolean plus = false;
096            boolean minus = false;
097            String word = it.next();
098            if (ignored(word)) {
099                continue;
100            }
101            if (word.startsWith(PLUS)) {
102                plus = true;
103                word = word.substring(1);
104            } else if (word.startsWith(MINUS)) {
105                minus = true;
106                word = word.substring(1);
107            }
108            if (word.startsWith(DOUBLE_QUOTES)) {
109                // read phrase
110                word = word.substring(1);
111                StringBuilder phrase = null;
112                while (true) {
113                    boolean end = word.endsWith(DOUBLE_QUOTES);
114                    if (end) {
115                        word = word.substring(0, word.length() - 1).trim();
116                    }
117                    if (word.contains(DOUBLE_QUOTES)) {
118                        throw new QueryParseException("Invalid fulltext query (double quotes in word): " + query);
119                    }
120                    if (word.length() != 0) {
121                        if (phrase == null) {
122                            phrase = new StringBuilder();
123                        } else {
124                            phrase.append(CSPACE);
125                        }
126                        phrase.append(word);
127                    }
128                    if (end) {
129                        break;
130                    }
131                    if (!it.hasNext()) {
132                        throw new QueryParseException("Invalid fulltext query (unterminated phrase): " + query);
133                    }
134                    word = it.next();
135                }
136                if (phrase == null) {
137                    continue;
138                }
139                word = phrase.toString();
140            } else if (word.equalsIgnoreCase(OR)) {
141                if (wasOr) {
142                    throw new QueryParseException("Invalid fulltext query (OR OR): " + query);
143                }
144                if (terms.isEmpty()) {
145                    throw new QueryParseException("Invalid fulltext query (standalone OR): " + query);
146                }
147                wasOr = true;
148                continue;
149            }
150            FulltextQuery w = new FulltextQuery();
151            if (minus) {
152                if (word.length() == 0) {
153                    throw new QueryParseException("Invalid fulltext query (standalone -): " + query);
154                }
155                w.op = Op.NOTWORD;
156            } else {
157                if (plus) {
158                    if (word.length() == 0) {
159                        throw new QueryParseException("Invalid fulltext query (standalone +): " + query);
160                    }
161                }
162                w.op = Op.WORD;
163            }
164            if (wasOr) {
165                endAnd();
166                wasOr = false;
167            }
168            w.word = word;
169            terms.add(w);
170        }
171        if (wasOr) {
172            throw new QueryParseException("Invalid fulltext query (final OR): " + query);
173        }
174        // final terms
175        endAnd();
176        int size = ft.terms.size();
177        if (size == 0) {
178            // all terms were negative
179            return null;
180        } else if (size == 1) {
181            // simplify when no OR
182            ft = ft.terms.get(0);
183        }
184        return ft;
185    }
186
187    protected String[] split(String query) {
188        return SEPARATOR.split(query);
189    }
190
191    protected boolean ignored(String word) {
192        if ("-".equals(word) || "+".equals(word) || word.contains("\"")) {
193            return false; // dealt with later, different error
194        }
195        return IGNORED.matcher(word).matches();
196    }
197
198    // add current ANDed terms to global OR
199    protected void endAnd() {
200        // put negative words at the end
201        List<FulltextQuery> pos = new LinkedList<>();
202        List<FulltextQuery> neg = new LinkedList<>();
203        for (FulltextQuery term : terms) {
204            if (term.op == Op.NOTWORD) {
205                neg.add(term);
206            } else {
207                pos.add(term);
208            }
209        }
210        if (!pos.isEmpty()) {
211            terms = pos;
212            terms.addAll(neg);
213            if (terms.size() == 1) {
214                ft.terms.add(terms.get(0));
215            } else {
216                FulltextQuery a = new FulltextQuery();
217                a.op = Op.AND;
218                a.terms = terms;
219                ft.terms.add(a);
220            }
221        }
222        terms = new LinkedList<>();
223    }
224
225    public static void translate(FulltextQuery ft, StringBuilder sb, String or, String and, String andNot,
226            String wordStart, String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd,
227            boolean quotePhraseWords) {
228        if (ft.op == Op.AND || ft.op == Op.OR) {
229            sb.append('(');
230            for (int i = 0; i < ft.terms.size(); i++) {
231                FulltextQuery term = ft.terms.get(i);
232                if (i > 0) {
233                    sb.append(' ');
234                    if (ft.op == Op.OR) {
235                        sb.append(or);
236                    } else { // Op.AND
237                        if (term.op == Op.NOTWORD) {
238                            sb.append(andNot);
239                        } else {
240                            sb.append(and);
241                        }
242                    }
243                    sb.append(' ');
244                }
245                translate(term, sb, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd,
246                        quotePhraseWords);
247            }
248            sb.append(')');
249            return;
250        } else {
251            String word = ft.word;
252            if (ft.isPhrase()) {
253                if (quotePhraseWords) {
254                    boolean first = true;
255                    for (String w : word.split(" ")) {
256                        if (!first) {
257                            sb.append(" ");
258                        }
259                        first = false;
260                        appendWord(w, sb, wordStart, wordEnd, wordCharsReserved);
261                    }
262                } else {
263                    sb.append(phraseStart);
264                    sb.append(word);
265                    sb.append(phraseEnd);
266                }
267            } else {
268                appendWord(word, sb, wordStart, wordEnd, wordCharsReserved);
269            }
270        }
271    }
272
273    protected static void appendWord(String word, StringBuilder sb, String start, String end, Set<Character> reserved) {
274        boolean quote = true;
275        if (!reserved.isEmpty()) {
276            for (char c : word.toCharArray()) {
277                if (reserved.contains(Character.valueOf(c))) {
278                    quote = false;
279                    break;
280                }
281            }
282        }
283        if (quote) {
284            sb.append(start);
285        }
286        sb.append(word);
287        if (quote) {
288            sb.append(end);
289        }
290    }
291
292    public static boolean hasPhrase(FulltextQuery ft) {
293        if (ft.op == Op.AND || ft.op == Op.OR) {
294            for (FulltextQuery term : ft.terms) {
295                if (hasPhrase(term)) {
296                    return true;
297                }
298            }
299            return false;
300        } else {
301            return ft.isPhrase();
302        }
303    }
304
305    /**
306     * Analyzes a fulltext query into a generic datastructure that can be used for each specific database.
307     * <p>
308     * List of terms containing only negative words are suppressed. Otherwise negative words are put at the end of the
309     * lists of terms.
310     */
311    public static FulltextQuery analyzeFulltextQuery(String query) {
312        return new FulltextQueryAnalyzer().analyze(query);
313    }
314
315    /**
316     * Translate fulltext into a common pattern used by many servers.
317     */
318    public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String phraseQuote) {
319        StringBuilder sb = new StringBuilder();
320        translate(ft, sb, or, and, andNot, "", "", Collections.<Character> emptySet(), phraseQuote, phraseQuote, false);
321        return sb.toString();
322    }
323
324    /**
325     * Translate fulltext into a common pattern used by many servers.
326     */
327    public static String translateFulltext(FulltextQuery ft, String or, String and, String andNot, String wordStart,
328            String wordEnd, Set<Character> wordCharsReserved, String phraseStart, String phraseEnd,
329            boolean quotePhraseWords) {
330        StringBuilder sb = new StringBuilder();
331        translate(ft, sb, or, and, andNot, wordStart, wordEnd, wordCharsReserved, phraseStart, phraseEnd,
332                quotePhraseWords);
333        return sb.toString();
334    }
335
336}