001/*
002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Nuxeo
016 *     Antoine Taillefer
017 */
018
019package org.nuxeo.ecm.core.convert.plugins.text.extractors;
020
021import java.util.Stack;
022
023import org.apache.commons.lang.StringUtils;
024import org.apache.commons.logging.Log;
025import org.apache.commons.logging.LogFactory;
026import org.xml.sax.Attributes;
027import org.xml.sax.SAXException;
028import org.xml.sax.helpers.DefaultHandler;
029
030public class OOoXmlContentHandler extends DefaultHandler {
031
032    protected static final Log log = LogFactory.getLog(OOoXmlContentHandler.class);
033
034    protected final StringBuffer sb = new StringBuffer();
035
036    protected final Stack<String> path = new Stack<String>();
037
038    protected boolean dumpText = false;
039
040    protected boolean isSpreadSheet = false;
041
042    public String getContent() {
043        return sb.toString();
044    }
045
046    @Override
047    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
048        path.push(qName);
049
050        // Detect spreadsheet
051        if (qName.equals("office:spreadsheet")) {
052            isSpreadSheet = true;
053        }
054
055        // Text element
056        if (qName.startsWith("text:")) {
057            dumpText = true;
058        }
059        // Heading (Writer only): add a new line.
060        // If the heading's outline level is > 1 (not the document title), add
061        // an extra new line.
062        if (qName.equals("text:h")) {
063            sb.append("\n");
064            String outlineLevelAtt = atts.getValue("text:outline-level");
065            if (!StringUtils.isEmpty(outlineLevelAtt)) {
066                int outlineLevel = -1;
067                try {
068                    outlineLevel = Integer.parseInt(outlineLevelAtt);
069                } catch (NumberFormatException nfe) {
070                    log.warn("Attribute 'text:outline-level' on element 'text:h' has a non integer value.");
071                }
072                if (outlineLevel > 1) {
073                    sb.append("\n");
074                }
075            }
076        }
077        // Paragraph: add a new line
078        if (!isSpreadSheet && qName.equals("text:p")) {
079            sb.append("\n");
080        }
081
082        // Page (Impress only): add a new line if not the first one
083        if (qName.equals("draw:page") && !"page1".equals(atts.getValue("draw:name"))) {
084            sb.append("\n");
085        }
086    }
087
088    @Override
089    public void characters(char[] ch, int start, int length) throws SAXException {
090        if (dumpText) {
091            String content = String.valueOf(ch, start, length);
092            sb.append(content);
093        }
094    }
095
096    @Override
097    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
098        path.pop();
099        if (path.isEmpty() || !path.lastElement().startsWith("text:")) {
100            dumpText = false;
101        }
102
103        // Specific separators for spreadsheets:
104        if (isSpreadSheet) {
105
106            // End of table row: add a blank line
107            if (qName.equals("table:table-row")) {
108                sb.append("\n\n");
109            }
110
111            // End of table cell: add a separator
112            if (qName.equals("table:table-cell")) {
113                sb.append(" ");
114            }
115
116            // End of paragraph: add a white space
117            if (qName.equals("text:p")) {
118                sb.append(" ");
119            }
120        }
121    }
122
123}