001/*
002 * (C) Copyright 2012-2014 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Florent Guillaume
018 */
019package org.nuxeo.ecm.core.storage;
020
021import java.util.ArrayList;
022import java.util.List;
023import java.util.regex.Pattern;
024
025import net.htmlparser.jericho.Renderer;
026import net.htmlparser.jericho.Source;
027
028import org.apache.commons.lang.StringEscapeUtils;
029import org.apache.commons.lang.StringUtils;
030import org.nuxeo.ecm.core.api.DocumentLocation;
031import org.nuxeo.runtime.api.Framework;
032
033/**
034 * Default fulltext parser, based on word and punctuation split, and lowercase normalization.
035 * <p>
036 * The regexp used can be configured using the system property {@value #WORD_SPLIT_PROP}. The default is
037 * {@value #WORD_SPLIT_DEF}.
038 *
039 * @since 5.9.5
040 */
041public class DefaultFulltextParser implements FulltextParser {
042
043    public static final String WORD_SPLIT_PROP = "org.nuxeo.fulltext.wordsplit";
044
045    public static final String WORD_SPLIT_DEF = "[\\s\\p{Punct}]+";
046
047    protected static final Pattern WORD_SPLIT_PATTERN = Pattern.compile(Framework.getProperty(WORD_SPLIT_PROP,
048            WORD_SPLIT_DEF));
049
050    protected static final int HTML_MAGIC_OFFSET = 8192;
051
052    protected static final String TEXT_HTML = "text/html";
053
054    @Override
055    public String parse(String s, String path) {
056        return parse(s, path, null, null);
057    }
058
059    @Override
060    public void parse(String s, String path, List<String> strings) {
061        parse(s, path, null, null, strings);
062    }
063
064    @Override
065    public String parse(String s, String path, String mimeType, DocumentLocation documentLocation) {
066        List<String> strings = new ArrayList<>();
067        parse(s, path, mimeType, documentLocation, strings);
068        return StringUtils.join(strings, ' ');
069    }
070
071    /**
072     * {@inheritDoc}
073     * <p>
074     * The default implementation normalizes text to lowercase and removes punctuation. The documentLocation parameter
075     * is currently unused but has some use cases for potential subclasses.
076     * <p>
077     * This can be subclassed.
078     */
079    @Override
080    public void parse(String s, String path, String mimeType, DocumentLocation documentLocation, List<String> strings) {
081        s = preprocessField(s, path, mimeType);
082        for (String word : WORD_SPLIT_PATTERN.split(s)) {
083            if (!word.isEmpty()) {
084                strings.add(word.toLowerCase());
085            }
086        }
087    }
088
089    /**
090     * Preprocesses one field at the given path.
091     * <p>
092     * The path is unused for now.
093     */
094    protected String preprocessField(String s, String path, String mimeType) {
095        if (s == null) {
096            return null;
097        }
098        if (StringUtils.isEmpty(mimeType)) {
099            // Use weak HTML detection here since nuxeo-core-mimetype 'magic.xml' has text/html detection commented
100            String htmlMagicExtraction = s.substring(0, Math.min(s.length(), HTML_MAGIC_OFFSET));
101            String htmlMagicExtractionLC = htmlMagicExtraction.toLowerCase();
102            if (htmlMagicExtractionLC.startsWith("<!doctype html") || htmlMagicExtractionLC.contains("<html")) {
103                mimeType = TEXT_HTML;
104            }
105        }
106        if (TEXT_HTML.equals(mimeType)) {
107            s = removeHtml(s);
108        }
109        return StringEscapeUtils.unescapeHtml(s);
110    }
111
112    protected String removeHtml(String s) {
113        Source source = new Source(s);
114        Renderer renderer = source.getRenderer();
115        renderer.setIncludeHyperlinkURLs(false);
116        renderer.setDecorateFontStyles(false);
117        return renderer.toString();
118    }
119
120}