/*
 * (C) Copyright 2006-2007 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Nuxeo - initial API and implementation
 *
 */
package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.util.Map;

import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.impl.blob.StringBlob;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
import org.nuxeo.ecm.core.convert.extension.Converter;
import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;

import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;

/**
 * Extract the text content of HTML documents while trying to respect the paragraph structure.
042 * 043 * @author <a href="mailto:troger@nuxeo.com">Thomas Roger</a> 044 * @author <a href="mailto:ogrisel@nuxeo.com">Olivier Grisel</a> 045 */ 046public class Html2TextConverter implements Converter { 047 048 @Override 049 public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException { 050 051 try { 052 Blob blob = blobHolder.getBlob(); 053 // if the underlying source is unambiguously decoded, access the 054 // decoded string directly 055 Source source; 056 if (blob instanceof StringBlob) { 057 source = new Source(blob.getString()); 058 } else if (blob.getEncoding() != null) { 059 Reader reader = new InputStreamReader(blob.getStream(), blob.getEncoding()); 060 source = new Source(reader); 061 } else { 062 // otherwise use the parser charset heuristic to decode properly 063 source = new Source(blob.getStream()); 064 } 065 Renderer renderer = source.getRenderer(); 066 renderer.setIncludeHyperlinkURLs(false); 067 renderer.setDecorateFontStyles(false); 068 String text = renderer.toString(); 069 text = text.replaceAll("\r\n", "\n"); // unix end of line 070 text = text.replaceAll(" *\n", "\n"); // clean trailing spaces 071 text = text.replaceAll("\\n\\n+", "\n\n"); // clean multiple lines 072 text = text.trim(); 073 return new SimpleCachableBlobHolder(Blobs.createBlob(text)); 074 } catch (IOException e) { 075 throw new ConversionException("Error during Html2Text conversion", blobHolder, e); 076 } 077 } 078 079 @Override 080 public void init(ConverterDescriptor descriptor) { 081 } 082 083}