/*
 * (C) Copyright 2012-2014 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Florent Guillaume
 */
package org.nuxeo.ecm.core.storage;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.nuxeo.ecm.core.api.DocumentLocation;
import org.nuxeo.runtime.api.Framework;

/**
 * Default fulltext parser, based on word and punctuation split, and lowercase normalization.
 * <p>
 * The regexp used can be configured using the system property {@value #WORD_SPLIT_PROP}. The default is
 * {@value #WORD_SPLIT_DEF}.
 * <p>
 * Lowercasing is done with {@link Locale#ROOT} so that the produced words do not depend on the default locale of the
 * JVM.
 *
 * @since 5.9.5
 */
public class DefaultFulltextParser implements FulltextParser {

    /** Name of the property holding the regexp used to split text into words. */
    public static final String WORD_SPLIT_PROP = "org.nuxeo.fulltext.wordsplit";

    /** Default word-splitting regexp: runs of whitespace and/or punctuation. */
    public static final String WORD_SPLIT_DEF = "[\\s\\p{Punct}]+";

    /** Compiled word-splitting pattern, read once from {@link #WORD_SPLIT_PROP} when the class is initialized. */
    protected static final Pattern WORD_SPLIT_PATTERN = Pattern.compile(Framework.getProperty(WORD_SPLIT_PROP,
            WORD_SPLIT_DEF));

    /** Maximum number of leading characters inspected when sniffing for HTML content. */
    protected static final int HTML_MAGIC_OFFSET = 8192;

    protected static final String TEXT_HTML = "text/html";

    /**
     * Parses the text without mime type or document location, and returns the words joined by single spaces.
     */
    @Override
    public String parse(String s, String path) {
        return parse(s, path, null, null);
    }

    /**
     * Parses the text without mime type or document location, appending the words to {@code strings}.
     */
    @Override
    public void parse(String s, String path, List<String> strings) {
        parse(s, path, null, null, strings);
    }

    /**
     * Parses the text and returns the resulting words joined by single spaces.
     *
     * @return the space-separated words, which is the empty string when no word was produced
     */
    @Override
    public String parse(String s, String path, String mimeType, DocumentLocation documentLocation) {
        List<String> strings = new ArrayList<>();
        parse(s, path, mimeType, documentLocation, strings);
        return StringUtils.join(strings, ' ');
    }

    /**
     * {@inheritDoc}
     * <p>
     * The default implementation normalizes text to lowercase and removes punctuation. The documentLocation parameter
     * is currently unused but has some use cases for potential subclasses.
     * <p>
     * A {@code null} text (or a preprocessing step returning {@code null}) contributes no words.
     * <p>
     * This can be subclassed.
     */
    @Override
    public void parse(String s, String path, String mimeType, DocumentLocation documentLocation, List<String> strings) {
        String text = preprocessField(s, path, mimeType);
        if (text == null) {
            // preprocessField maps null to null (and subclasses may do so too): nothing to split
            return;
        }
        for (String word : WORD_SPLIT_PATTERN.split(text)) {
            // split() may yield a leading empty token when the text starts with a separator
            if (!word.isEmpty()) {
                // fixed locale: the result must not vary with the server's default locale
                strings.add(word.toLowerCase(Locale.ROOT));
            }
        }
    }

    /**
     * Preprocesses one field at the given path.
     * <p>
     * When no mime type is given, the first {@link #HTML_MAGIC_OFFSET} characters are checked for an HTML doctype or
     * an {@code <html} tag. HTML content has its markup removed through {@link #removeHtml}. In all cases HTML
     * entities are finally unescaped.
     * <p>
     * The path is unused for now.
     *
     * @param s the text to preprocess, may be {@code null}
     * @param path the path of the field (unused)
     * @param mimeType the mime type of the text, may be {@code null} or empty to trigger HTML detection
     * @return the preprocessed text, or {@code null} if {@code s} is {@code null}
     */
    protected String preprocessField(String s, String path, String mimeType) {
        if (s == null) {
            return null;
        }
        if (StringUtils.isEmpty(mimeType)) {
            // Use weak HTML detection here since nuxeo-core-mimetype 'magic.xml' has text/html detection commented
            String htmlMagicExtraction = s.substring(0, Math.min(s.length(), HTML_MAGIC_OFFSET));
            String htmlMagicExtractionLC = htmlMagicExtraction.toLowerCase(Locale.ROOT);
            if (htmlMagicExtractionLC.startsWith("<!doctype html") || htmlMagicExtractionLC.contains("<html")) {
                mimeType = TEXT_HTML;
            }
        }
        if (TEXT_HTML.equals(mimeType)) {
            s = removeHtml(s);
        }
        return StringEscapeUtils.unescapeHtml(s);
    }

    /**
     * Renders the given HTML as text using the Jericho {@link Renderer}, with hyperlink URLs and font style
     * decorations turned off.
     *
     * @param s the HTML text, must not be {@code null}
     * @return the rendered text
     */
    protected String removeHtml(String s) {
        Source source = new Source(s);
        Renderer renderer = source.getRenderer();
        renderer.setIncludeHyperlinkURLs(false);
        renderer.setDecorateFontStyles(false);
        return renderer.toString();
    }

}