001/*
002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Nuxeo
016 *     Antoine Taillefer
017 */
018
019package org.nuxeo.ecm.core.convert.plugins.text.extractors;
020
021import java.util.Stack;
022
023import org.xml.sax.Attributes;
024import org.xml.sax.SAXException;
025import org.xml.sax.helpers.DefaultHandler;
026
/**
 * SAX handler that accumulates the text found in the {@code w:t} and
 * {@code a:t} elements of an OpenXML part (text elements of a docx or pptx
 * document) into a single string.
 * <p>
 * A line break is emitted at the start of every {@code w:p} / {@code a:p}
 * paragraph, plus an additional one for each {@code w:pStyle} whose
 * {@code w:val} is neither {@code style0} nor {@code style1}.
 * <p>
 * Elements are matched on their qualified name, so the handler relies on the
 * literal {@code w:} and {@code a:} prefixes being used in the parsed document.
 */
public class OpenXmlContentHandler extends DefaultHandler {

    private static final String WORD_TEXT = "w:t";

    private static final String DRAWING_TEXT = "a:t";

    private static final String WORD_PARAGRAPH = "w:p";

    private static final String DRAWING_PARAGRAPH = "a:p";

    private static final String WORD_PARAGRAPH_STYLE = "w:pStyle";

    private static final String STYLE_VALUE_ATTRIBUTE = "w:val";

    /** Extracted text, in document order. */
    protected final StringBuffer sb = new StringBuffer();

    /** Qualified names of the currently open elements, innermost last. */
    protected final Stack<String> path = new Stack<String>();

    /** Whether character data currently received must be appended to {@link #sb}. */
    protected boolean dumpText = false;

    /**
     * Returns the text accumulated so far.
     *
     * @return the extracted text, possibly empty
     */
    public String getContent() {
        return sb.toString();
    }

    /**
     * Records the opened element, switches text extraction on when it is a
     * text element and emits line breaks for paragraphs and headings.
     *
     * @param namespaceURI the namespace URI (unused)
     * @param localName the local name (unused)
     * @param qName the qualified name used to recognise the element
     * @param atts the element attributes, only read for {@code w:pStyle}
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
        path.push(qName);
        // Text element of a docx or pptx document
        if (isTextElement(qName)) {
            dumpText = true;
        }
        // If the paragraph's style is "styleX" with X > 1 (this is a heading,
        // but not the document title), add a new line.
        if (WORD_PARAGRAPH_STYLE.equals(qName)) {
            String style = atts.getValue(STYLE_VALUE_ATTRIBUTE);
            if (!"style0".equals(style) && !"style1".equals(style)) {
                sb.append("\n");
            }
        }
        // Paragraph: add a new line
        if (WORD_PARAGRAPH.equals(qName) || DRAWING_PARAGRAPH.equals(qName)) {
            sb.append("\n");
        }
    }

    /**
     * Appends the received characters to the content while inside a text
     * element; ignores them otherwise.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (dumpText) {
            sb.append(ch, start, length);
        }
    }

    /**
     * Closes the current element and keeps text extraction enabled only if the
     * element that becomes current again is itself a text element.
     * <p>
     * Both {@code w:t} and {@code a:t} are considered here, mirroring
     * {@link #startElement}, so that a child element closing inside an
     * {@code a:t} does not drop the remaining text of that element.
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
        path.pop();
        if (path.isEmpty() || !isTextElement(path.peek())) {
            dumpText = false;
        }
    }

    /** Tells whether the given qualified name denotes a text element ({@code w:t} or {@code a:t}). */
    private static boolean isTextElement(String qName) {
        return WORD_TEXT.equals(qName) || DRAWING_TEXT.equals(qName);
    }

}