001/*
002 * (C) Copyright 2002-2007 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Nuxeo - initial API and implementation
016 *
017 */
018package org.nuxeo.ecm.core.convert.plugins.text.extractors;
019
020import java.io.IOException;
021import java.io.InputStream;
022import java.io.InputStreamReader;
023import java.io.Reader;
024import java.io.Serializable;
025import java.util.Map;
026
027import net.htmlparser.jericho.Renderer;
028import net.htmlparser.jericho.Source;
029
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.nuxeo.ecm.core.api.Blob;
033import org.nuxeo.ecm.core.api.Blobs;
034import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
035import org.nuxeo.ecm.core.api.impl.blob.StringBlob;
036import org.nuxeo.ecm.core.convert.api.ConversionException;
037import org.nuxeo.ecm.core.convert.cache.SimpleCachableBlobHolder;
038import org.nuxeo.ecm.core.convert.extension.Converter;
039import org.nuxeo.ecm.core.convert.extension.ConverterDescriptor;
040
041/**
042 * Extract the text content of HTML documents while trying to respect the paragraph structure.
043 *
044 * @author <a href="mailto:troger@nuxeo.com">Thomas Roger</a>
045 * @author <a href="mailto:ogrisel@nuxeo.com">Olivier Grisel</a>
046 */
047public class Html2TextConverter implements Converter {
048
049    private static final Log log = LogFactory.getLog(Html2TextConverter.class);
050
051    @Override
052    public BlobHolder convert(BlobHolder blobHolder, Map<String, Serializable> parameters) throws ConversionException {
053
054        InputStream stream = null;
055        try {
056            Blob blob = blobHolder.getBlob();
057            // if the underlying source is unambiguously decoded, access the
058            // decoded string directly
059            Source source;
060            if (blob instanceof StringBlob) {
061                source = new Source(blob.getString());
062            } else if (blob.getEncoding() != null) {
063                Reader reader = new InputStreamReader(blob.getStream(), blob.getEncoding());
064                source = new Source(reader);
065            } else {
066                // otherwise use the parser charset heuristic to decode properly
067                source = new Source(blob.getStream());
068            }
069            Renderer renderer = source.getRenderer();
070            renderer.setIncludeHyperlinkURLs(false);
071            renderer.setDecorateFontStyles(false);
072            String text = renderer.toString();
073            text = text.replaceAll("\r\n", "\n"); // unix end of line
074            text = text.replaceAll(" *\n", "\n"); // clean trailing spaces
075            text = text.replaceAll("\\n\\n+", "\n\n"); // clean multiple lines
076            text = text.trim();
077            return new SimpleCachableBlobHolder(Blobs.createBlob(text));
078        } catch (IOException e) {
079            throw new ConversionException("Error during Html2Text conversion", e);
080        } finally {
081            if (stream != null) {
082                try {
083                    stream.close();
084                } catch (IOException e) {
085                    log.error("Error while closing Blob stream", e);
086                }
087            }
088        }
089    }
090
091    @Override
092    public void init(ConverterDescriptor descriptor) {
093    }
094
095}