001/* 002 * (C) Copyright 2012-2014 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 * Contributors: 017 * Florent Guillaume 018 */ 019package org.nuxeo.ecm.core.storage; 020 021import java.util.ArrayList; 022import java.util.List; 023import java.util.regex.Pattern; 024 025import net.htmlparser.jericho.Renderer; 026import net.htmlparser.jericho.Source; 027 028import org.apache.commons.lang.StringEscapeUtils; 029import org.apache.commons.lang.StringUtils; 030import org.nuxeo.runtime.api.Framework; 031 032/** 033 * Default fulltext parser, based on word and punctuation split, and lowercase normalization. 034 * <p> 035 * The regexp used can be configured using the system property {@value #WORD_SPLIT_PROP}. The default is 036 * {@value #WORD_SPLIT_DEF}. 037 * 038 * @since 5.9.5 039 */ 040public class DefaultFulltextParser implements FulltextParser { 041 042 public static final String WORD_SPLIT_PROP = "org.nuxeo.fulltext.wordsplit"; 043 044 public static final String WORD_SPLIT_DEF = "[\\s\\p{Punct}]+"; 045 046 protected static final Pattern WORD_SPLIT_PATTERN = Pattern.compile(Framework.getProperty(WORD_SPLIT_PROP, 047 WORD_SPLIT_DEF)); 048 049 @Override 050 public String parse(String s, String path) { 051 List<String> strings = new ArrayList<>(); 052 parse(s, path, strings); 053 return StringUtils.join(strings, ' '); 054 } 055 056 /** 057 * {@inheritDoc} 058 * <p> 059 * The default implementation normalizes text to lowercase and removes punctuation. 060 * <p> 061 * This can be subclassed. 062 */ 063 @Override 064 public void parse(String s, String path, List<String> strings) { 065 s = preprocessField(s, path); 066 for (String word : WORD_SPLIT_PATTERN.split(s)) { 067 if (!word.isEmpty()) { 068 strings.add(word.toLowerCase()); 069 } 070 } 071 } 072 073 /** 074 * Preprocesses one field at the given path. 075 * <p> 076 * The path is unused for now. 077 */ 078 protected String preprocessField(String s, String path) { 079 if (s == null) { 080 return null; 081 } 082 if (s.contains("<")) { 083 s = removeHtml(s); 084 } 085 return StringEscapeUtils.unescapeHtml(s); 086 } 087 088 protected String removeHtml(String s) { 089 Source source = new Source(s); 090 Renderer renderer = source.getRenderer(); 091 renderer.setIncludeHyperlinkURLs(false); 092 renderer.setDecorateFontStyles(false); 093 return renderer.toString(); 094 } 095 096}