/*
 * (C) Copyright 2012-2014 Nuxeo SA (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Florent Guillaume
 */
package org.nuxeo.ecm.core.storage;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.nuxeo.runtime.api.Framework;

/**
 * Default fulltext parser, based on word and punctuation split, and lowercase normalization.
 * <p>
 * The regexp used can be configured using the system property {@value #WORD_SPLIT_PROP}. The default is
 * {@value #WORD_SPLIT_DEF}.
 *
 * @since 5.9.5
 */
public class DefaultFulltextParser implements FulltextParser {

    /** Name of the property holding the regexp used to split text into words. */
    public static final String WORD_SPLIT_PROP = "org.nuxeo.fulltext.wordsplit";

    /** Default word-split regexp: one or more whitespace or punctuation characters. */
    public static final String WORD_SPLIT_DEF = "[\\s\\p{Punct}]+";

    /**
     * Compiled word-split pattern. The property is read once, when this class is initialized, so later changes to
     * {@value #WORD_SPLIT_PROP} are not picked up by this field.
     */
    protected static final Pattern WORD_SPLIT_PATTERN = Pattern.compile(Framework.getProperty(WORD_SPLIT_PROP,
            WORD_SPLIT_DEF));

    /**
     * Parses one value and returns its normalized words joined by a single space.
     *
     * @param s the text to parse, may be {@code null}
     * @param path the path of the field being parsed, passed through to {@link #parse(String, String, List)}
     * @return the space-separated words, or an empty string if no word was found
     */
    @Override
    public String parse(String s, String path) {
        List<String> strings = new ArrayList<>();
        parse(s, path, strings);
        return StringUtils.join(strings, ' ');
    }

    /**
     * {@inheritDoc}
     * <p>
     * The default implementation normalizes text to lowercase and removes punctuation.
     * <p>
     * A {@code null} value (or a value that {@link #preprocessField} turns into {@code null}) contributes no words.
     * <p>
     * This can be subclassed.
     *
     * @param s the text to parse, may be {@code null}
     * @param path the path of the field being parsed
     * @param strings the list to which the words found are appended
     */
    @Override
    public void parse(String s, String path, List<String> strings) {
        s = preprocessField(s, path);
        if (s == null) {
            // preprocessField returns null for a null input; splitting null would throw a NullPointerException
            return;
        }
        for (String word : WORD_SPLIT_PATTERN.split(s)) {
            // split may yield an empty leading token when the text starts with a separator
            if (!word.isEmpty()) {
                // NOTE(review): toLowerCase() without a Locale uses the JVM default locale; confirm whether
                // locale-independent casing is wanted here and in any matching query-side normalization
                strings.add(word.toLowerCase());
            }
        }
    }

    /**
     * Preprocesses one field at the given path.
     * <p>
     * Text containing a {@code <} character is first passed through {@link #removeHtml}; HTML entities are then
     * unescaped.
     * <p>
     * The path is unused for now.
     *
     * @param s the raw field value, may be {@code null}
     * @param path the path of the field
     * @return the preprocessed text, or {@code null} if {@code s} is {@code null}
     */
    protected String preprocessField(String s, String path) {
        if (s == null) {
            return null;
        }
        // cheap check to avoid running the HTML renderer on text that cannot contain a tag
        if (s.contains("<")) {
            s = removeHtml(s);
        }
        return StringEscapeUtils.unescapeHtml(s);
    }

    /**
     * Renders the given HTML as text using the Jericho {@link Renderer}, with hyperlink URLs and font style
     * decorations turned off.
     *
     * @param s the text containing HTML markup, must not be {@code null}
     * @return the rendered text
     */
    protected String removeHtml(String s) {
        Source source = new Source(s);
        Renderer renderer = source.getRenderer();
        renderer.setIncludeHyperlinkURLs(false);
        renderer.setDecorateFontStyles(false);
        return renderer.toString();
    }

}