001/*
002 * (C) Copyright 2006-2008 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     bstefanescu
018 *
019 * $Id$
020 */
021
022package org.nuxeo.ecm.core.convert.plugins.text.extractors;
023
024import java.io.File;
025import java.io.IOException;
026import java.io.InputStream;
027
028import javax.xml.parsers.ParserConfigurationException;
029import javax.xml.parsers.SAXParser;
030import javax.xml.parsers.SAXParserFactory;
031
032import org.xml.sax.Attributes;
033import org.xml.sax.InputSource;
034import org.xml.sax.SAXException;
035import org.xml.sax.XMLReader;
036import org.xml.sax.helpers.DefaultHandler;
037
/**
 * SAX handler that collects the character data of an XML document into a single plain-text string.
 * <p>
 * Text runs that follow an element start or end tag have their leading whitespace removed and are
 * prefixed with one space, so that text coming from adjacent elements does not get glued together.
 * As a consequence the extracted text normally starts with a space.
 * <p>
 * The underlying parser is non-validating, does not load external DTDs and rejects any document
 * that contains a DOCTYPE declaration.
 * <p>
 * An instance keeps mutable parse state ({@link #buf}, {@link #trim}) and owns one
 * {@link SAXParser}; NOTE(review): presumably one instance must not be shared between concurrent
 * parses - confirm against callers.
 *
 * @author <a href="mailto:bs@nuxeo.com">Bogdan Stefanescu</a>
 */
public class Xml2TextHandler extends DefaultHandler {

    private static final String FEATURE_VALIDATION = "http://xml.org/sax/features/validation";

    private static final String FEATURE_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd";

    private static final String FEATURE_DISALLOW_DOCTYPE = "http://apache.org/xml/features/disallow-doctype-decl";

    /** Factory shared by all handlers; configured below as non-validating and not namespace aware. */
    protected static final SAXParserFactory factory = SAXParserFactory.newInstance();

    static {
        factory.setValidating(false);
        factory.setNamespaceAware(false);
    }

    /** Parser created for this handler by the constructor. */
    protected SAXParser parser;

    /** Text accumulated for the current document; {@code null} when no text is being held. */
    protected StringBuffer buf;

    /** When {@code true}, the next character chunk is left-trimmed and prefixed with a space. */
    protected boolean trim = false;

    /**
     * Creates the handler together with its own parser.
     * <p>
     * Validation and external DTD loading are switched off and DOCTYPE declarations are disallowed
     * on the parser's {@link XMLReader}.
     *
     * @throws SAXException if the parser cannot be created or one of the features is rejected
     * @throws ParserConfigurationException if the factory cannot build a parser with its configuration
     */
    public Xml2TextHandler() throws SAXException, ParserConfigurationException {
        parser = factory.newSAXParser();
        XMLReader reader = parser.getXMLReader();
        reader.setFeature(FEATURE_VALIDATION, false);
        reader.setFeature(FEATURE_LOAD_EXTERNAL_DTD, false);
        reader.setFeature(FEATURE_DISALLOW_DOCTYPE, true);
    }

    /**
     * @return the parser owned by this handler
     */
    public SAXParser getParser() {
        return parser;
    }

    /**
     * Parses the given file and returns its text content.
     *
     * @param file the XML file to read
     * @return the extracted text
     * @throws SAXException if the content cannot be parsed
     * @throws IOException if the file cannot be read
     */
    public String parse(File file) throws SAXException, IOException {
        parser.parse(file, this);
        return takeText();
    }

    /**
     * Parses the given stream and returns its text content.
     *
     * @param in the stream holding the XML content
     * @return the extracted text
     * @throws SAXException if the content cannot be parsed
     * @throws IOException if the stream cannot be read
     */
    public String parse(InputStream in) throws SAXException, IOException {
        parser.parse(in, this);
        return takeText();
    }

    /**
     * Parses the given input source and returns its text content.
     *
     * @param is the source holding the XML content
     * @return the extracted text
     * @throws SAXException if the content cannot be parsed
     * @throws IOException if the source cannot be read
     */
    public String parse(InputSource is) throws SAXException, IOException {
        parser.parse(is, this);
        return takeText();
    }

    /**
     * Returns the text accumulated so far without releasing it.
     * <p>
     * The {@code parse} methods of this class release the buffer once they return successfully, so
     * this accessor is only meaningful when the handler is driven by some other means (for example
     * through {@link #getParser()}).
     *
     * @return the accumulated text, or an empty string when no buffer is currently held (before the
     *         first document was started, or after a {@code parse} method returned)
     */
    public String getText() {
        return buf == null ? "" : buf.toString();
    }

    @Override
    public void startDocument() throws SAXException {
        // every document starts with a fresh buffer and no pending trim
        trim = false;
        buf = new StringBuffer();
    }

    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        // the text that follows a tag boundary is separated from what precedes it
        trim = true;
    }

    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        trim = true;
    }

    /**
     * Appends a chunk of character data to the buffer.
     * <p>
     * For the first chunk after a tag boundary the leading whitespace of the chunk is dropped and a
     * single space is written instead (even if nothing is left of the chunk); any other chunk is
     * appended unchanged.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (!trim) {
            buf.append(ch, start, length);
            return;
        }
        int end = start + length;
        int i = start;
        while (i < end && Character.isWhitespace(ch[i])) {
            i++;
        }
        buf.append(' ').append(ch, i, end - i);
        trim = false;
    }

    /** Snapshots the accumulated text and releases the buffer so it can be garbage collected. */
    private String takeText() {
        String text = buf.toString();
        buf = null;
        return text;
    }

}