001/*
002 * (C) Copyright 2012-2014 Nuxeo SA (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Florent Guillaume
016 */
017package org.nuxeo.ecm.core.storage;
018
019import java.util.ArrayList;
020import java.util.List;
021import java.util.regex.Pattern;
022
023import net.htmlparser.jericho.Renderer;
024import net.htmlparser.jericho.Source;
025
026import org.apache.commons.lang.StringEscapeUtils;
027import org.apache.commons.lang.StringUtils;
028import org.nuxeo.runtime.api.Framework;
029
030/**
031 * Default fulltext parser, based on word and punctuation split, and lowercase normalization.
032 * <p>
033 * The regexp used can be configured using the system property {@value #WORD_SPLIT_PROP}. The default is
034 * {@value #WORD_SPLIT_DEF}.
035 *
036 * @since 5.9.5
037 */
038public class DefaultFulltextParser implements FulltextParser {
039
040    public static final String WORD_SPLIT_PROP = "org.nuxeo.fulltext.wordsplit";
041
042    public static final String WORD_SPLIT_DEF = "[\\s\\p{Punct}]+";
043
044    protected static final Pattern WORD_SPLIT_PATTERN = Pattern.compile(Framework.getProperty(WORD_SPLIT_PROP,
045            WORD_SPLIT_DEF));
046
047    @Override
048    public String parse(String s, String path) {
049        List<String> strings = new ArrayList<>();
050        parse(s, path, strings);
051        return StringUtils.join(strings, ' ');
052    }
053
054    /**
055     * {@inheritDoc}
056     * <p>
057     * The default implementation normalizes text to lowercase and removes punctuation.
058     * <p>
059     * This can be subclassed.
060     */
061    @Override
062    public void parse(String s, String path, List<String> strings) {
063        s = preprocessField(s, path);
064        for (String word : WORD_SPLIT_PATTERN.split(s)) {
065            if (!word.isEmpty()) {
066                strings.add(word.toLowerCase());
067            }
068        }
069    }
070
071    /**
072     * Preprocesses one field at the given path.
073     * <p>
074     * The path is unused for now.
075     */
076    protected String preprocessField(String s, String path) {
077        if (s == null) {
078            return null;
079        }
080        if (s.contains("<")) {
081            s = removeHtml(s);
082        }
083        return StringEscapeUtils.unescapeHtml(s);
084    }
085
086    protected String removeHtml(String s) {
087        Source source = new Source(s);
088        Renderer renderer = source.getRenderer();
089        renderer.setIncludeHyperlinkURLs(false);
090        renderer.setDecorateFontStyles(false);
091        return renderer.toString();
092    }
093
094}