001/*
002 * (C) Copyright 2006-2008 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     bstefanescu
016 *
017 * $Id$
018 */
019
020package org.nuxeo.ecm.core.convert.plugins.text.extractors;
021
022import java.io.File;
023import java.io.IOException;
024import java.io.InputStream;
025
026import javax.xml.parsers.ParserConfigurationException;
027import javax.xml.parsers.SAXParser;
028import javax.xml.parsers.SAXParserFactory;
029
030import org.xml.sax.Attributes;
031import org.xml.sax.InputSource;
032import org.xml.sax.SAXException;
033import org.xml.sax.XMLReader;
034import org.xml.sax.helpers.DefaultHandler;
035
/**
 * SAX handler that extracts the character data of an XML document as plain
 * text.
 * <p>
 * Markup is dropped. The first chunk of character data reported after any
 * element start or end tag has its leading whitespace removed and is prefixed
 * with a single space, so that text coming from adjacent elements does not run
 * together.
 * <p>
 * Each instance owns its own {@link SAXParser} and keeps the state of the
 * current parse in fields, so a single instance is not meant to run several
 * parses at the same time.
 *
 * @author <a href="mailto:bs@nuxeo.com">Bogdan Stefanescu</a>
 */
public class Xml2TextHandler extends DefaultHandler {

    /** Shared factory, configured as non-validating and not namespace aware. */
    protected static final SAXParserFactory factory = SAXParserFactory.newInstance();

    static {
        factory.setValidating(false);
        factory.setNamespaceAware(false);
    }

    /** Parser created for this handler by the constructor. */
    protected SAXParser parser;

    /**
     * Text collected for the document being parsed. It is {@code null} until
     * {@link #startDocument()} is called and again after one of the
     * {@code parse} methods has returned its result.
     */
    protected StringBuffer buf;

    /**
     * When {@code true}, the next chunk of character data is stripped of its
     * leading whitespace and prefixed with a single space.
     */
    protected boolean trim = false;

    /**
     * Creates a handler together with its own parser. Validation and loading
     * of external DTDs are switched off, and DOCTYPE declarations are
     * disallowed on the underlying reader.
     *
     * @throws SAXException if the parser cannot be created or the reader
     *             rejects one of the features
     * @throws ParserConfigurationException if the factory cannot create a
     *             parser with the requested configuration
     */
    public Xml2TextHandler() throws SAXException, ParserConfigurationException {
        parser = factory.newSAXParser();
        XMLReader reader = parser.getXMLReader();
        reader.setFeature("http://xml.org/sax/features/validation", false);
        reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    }

    /**
     * Returns the parser owned by this handler.
     *
     * @return the parser created by the constructor
     */
    public SAXParser getParser() {
        return parser;
    }

    /**
     * Parses the given file and returns its text content.
     *
     * @param file the XML file to read
     * @return the extracted text
     * @throws SAXException if the parser reports an error
     * @throws IOException if the file cannot be read
     */
    public String parse(File file) throws SAXException, IOException {
        parser.parse(file, this);
        return takeText();
    }

    /**
     * Parses the given stream and returns its text content.
     *
     * @param in the stream holding the XML document
     * @return the extracted text
     * @throws SAXException if the parser reports an error
     * @throws IOException if the stream cannot be read
     */
    public String parse(InputStream in) throws SAXException, IOException {
        parser.parse(in, this);
        return takeText();
    }

    /**
     * Parses the given input source and returns its text content.
     *
     * @param is the source holding the XML document
     * @return the extracted text
     * @throws SAXException if the parser reports an error
     * @throws IOException if the source cannot be read
     */
    public String parse(InputSource is) throws SAXException, IOException {
        parser.parse(is, this);
        return takeText();
    }

    /**
     * Returns the text collected so far.
     * <p>
     * The buffer only exists between {@link #startDocument()} and the return
     * of a {@code parse} method; outside of that window there is nothing to
     * report and an empty string is returned.
     *
     * @return the collected text, or an empty string when no buffer is held
     */
    public String getText() {
        StringBuffer current = buf;
        return current == null ? "" : current.toString();
    }

    /** Resets the trimming state and starts a fresh, empty text buffer. */
    @Override
    public void startDocument() throws SAXException {
        trim = false;
        buf = new StringBuffer();
    }

    /** Marks the next chunk of character data for trimming. */
    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        trim = true;
    }

    /** Marks the next chunk of character data for trimming. */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        trim = true;
    }

    /**
     * Appends a chunk of character data to the buffer.
     * <p>
     * If a tag was seen since the previous chunk, the leading whitespace of
     * this chunk is skipped and a single space is appended before the rest
     * (the space is appended even when nothing is left after skipping).
     * Otherwise the chunk is appended unchanged.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (!trim) {
            buf.append(ch, start, length);
            return;
        }
        int end = start + length;
        int i = start;
        while (i < end && Character.isWhitespace(ch[i])) {
            i++;
        }
        buf.append(" ").append(ch, i, end - i);
        // only the first chunk after a tag is trimmed
        trim = false;
    }

    /** Returns the collected text and releases the buffer. */
    private String takeText() {
        String text = buf == null ? "" : buf.toString();
        buf = null;
        return text;
    }

}