001/*
002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Nuxeo
018 *     Antoine Taillefer
019 */
020
021package org.nuxeo.ecm.core.convert.plugins.text.extractors;
022
023import java.util.Stack;
024
025import org.apache.commons.lang.StringUtils;
026import org.apache.commons.logging.Log;
027import org.apache.commons.logging.LogFactory;
028import org.xml.sax.Attributes;
029import org.xml.sax.SAXException;
030import org.xml.sax.helpers.DefaultHandler;
031
032public class OOoXmlContentHandler extends DefaultHandler {
033
034    protected static final Log log = LogFactory.getLog(OOoXmlContentHandler.class);
035
036    protected final StringBuffer sb = new StringBuffer();
037
038    protected final Stack<String> path = new Stack<String>();
039
040    protected boolean dumpText = false;
041
042    protected boolean isSpreadSheet = false;
043
044    public String getContent() {
045        return sb.toString();
046    }
047
048    @Override
049    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
050        path.push(qName);
051
052        // Detect spreadsheet
053        if (qName.equals("office:spreadsheet")) {
054            isSpreadSheet = true;
055        }
056
057        // Text element
058        if (qName.startsWith("text:")) {
059            dumpText = true;
060        }
061        // Heading (Writer only): add a new line.
062        // If the heading's outline level is > 1 (not the document title), add
063        // an extra new line.
064        if (qName.equals("text:h")) {
065            sb.append("\n");
066            String outlineLevelAtt = atts.getValue("text:outline-level");
067            if (!StringUtils.isEmpty(outlineLevelAtt)) {
068                int outlineLevel = -1;
069                try {
070                    outlineLevel = Integer.parseInt(outlineLevelAtt);
071                } catch (NumberFormatException nfe) {
072                    log.warn("Attribute 'text:outline-level' on element 'text:h' has a non integer value.");
073                }
074                if (outlineLevel > 1) {
075                    sb.append("\n");
076                }
077            }
078        }
079        // Paragraph: add a new line
080        if (!isSpreadSheet && qName.equals("text:p")) {
081            sb.append("\n");
082        }
083
084        // Page (Impress only): add a new line if not the first one
085        if (qName.equals("draw:page") && !"page1".equals(atts.getValue("draw:name"))) {
086            sb.append("\n");
087        }
088    }
089
090    @Override
091    public void characters(char[] ch, int start, int length) throws SAXException {
092        if (dumpText) {
093            String content = String.valueOf(ch, start, length);
094            sb.append(content);
095        }
096    }
097
098    @Override
099    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
100        path.pop();
101        if (path.isEmpty() || !path.lastElement().startsWith("text:")) {
102            dumpText = false;
103        }
104
105        // Specific separators for spreadsheets:
106        if (isSpreadSheet) {
107
108            // End of table row: add a blank line
109            if (qName.equals("table:table-row")) {
110                sb.append("\n\n");
111            }
112
113            // End of table cell: add a separator
114            if (qName.equals("table:table-cell")) {
115                sb.append(" ");
116            }
117
118            // End of paragraph: add a white space
119            if (qName.equals("text:p")) {
120                sb.append(" ");
121            }
122        }
123    }
124
125}