001/*
002 * (C) Copyright 2012-2014 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 */
019package org.nuxeo.ecm.core.storage;
020
021import java.util.ArrayList;
022import java.util.List;
023import java.util.regex.Pattern;
024
025import net.htmlparser.jericho.Renderer;
026import net.htmlparser.jericho.Source;
027
028import org.apache.commons.lang.StringEscapeUtils;
029import org.apache.commons.lang.StringUtils;
030import org.nuxeo.runtime.api.Framework;
031
032/**
033 * Default fulltext parser, based on word and punctuation split, and lowercase normalization.
034 * <p>
035 * The regexp used can be configured using the system property {@value #WORD_SPLIT_PROP}. The default is
036 * {@value #WORD_SPLIT_DEF}.
037 *
038 * @since 5.9.5
039 */
040public class DefaultFulltextParser implements FulltextParser {
041
042    public static final String WORD_SPLIT_PROP = "org.nuxeo.fulltext.wordsplit";
043
044    public static final String WORD_SPLIT_DEF = "[\\s\\p{Punct}]+";
045
046    protected static final Pattern WORD_SPLIT_PATTERN = Pattern.compile(Framework.getProperty(WORD_SPLIT_PROP,
047            WORD_SPLIT_DEF));
048
049    @Override
050    public String parse(String s, String path) {
051        List<String> strings = new ArrayList<>();
052        parse(s, path, strings);
053        return StringUtils.join(strings, ' ');
054    }
055
056    /**
057     * {@inheritDoc}
058     * <p>
059     * The default implementation normalizes text to lowercase and removes punctuation.
060     * <p>
061     * This can be subclassed.
062     */
063    @Override
064    public void parse(String s, String path, List<String> strings) {
065        s = preprocessField(s, path);
066        for (String word : WORD_SPLIT_PATTERN.split(s)) {
067            if (!word.isEmpty()) {
068                strings.add(word.toLowerCase());
069            }
070        }
071    }
072
073    /**
074     * Preprocesses one field at the given path.
075     * <p>
076     * The path is unused for now.
077     */
078    protected String preprocessField(String s, String path) {
079        if (s == null) {
080            return null;
081        }
082        if (s.contains("<")) {
083            s = removeHtml(s);
084        }
085        return StringEscapeUtils.unescapeHtml(s);
086    }
087
088    protected String removeHtml(String s) {
089        Source source = new Source(s);
090        Renderer renderer = source.getRenderer();
091        renderer.setIncludeHyperlinkURLs(false);
092        renderer.setDecorateFontStyles(false);
093        return renderer.toString();
094    }
095
096}