001/*
002 * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors.
003 *
004 * All rights reserved. This program and the accompanying materials
005 * are made available under the terms of the GNU Lesser General Public License
006 * (LGPL) version 2.1 which accompanies this distribution, and is available at
007 * http://www.gnu.org/licenses/lgpl.html
008 *
009 * This library is distributed in the hope that it will be useful,
010 * but WITHOUT ANY WARRANTY; without even the implied warranty of
011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
012 * Lesser General Public License for more details.
013 *
014 * Contributors:
015 *     Nuxeo
016 *     Antoine Taillefer
017 *
018 */
019
020package org.nuxeo.ecm.core.convert.plugins.text.extractors;
021
022import java.io.ByteArrayInputStream;
023import java.io.IOException;
024import java.util.Iterator;
025import java.util.Set;
026import java.util.TreeSet;
027import java.util.zip.ZipEntry;
028import java.util.zip.ZipInputStream;
029
030import org.apache.commons.io.IOUtils;
031import org.apache.commons.logging.Log;
032import org.apache.commons.logging.LogFactory;
033import org.nuxeo.ecm.core.convert.plugins.text.extractors.presentation.PresentationSlide;
034import org.xml.sax.InputSource;
035import org.xml.sax.SAXException;
036import org.xml.sax.XMLReader;
037
038/**
039 * Pptx to text converter: parses the Open XML presentation document to read its content.
040 */
041public class PPTX2TextConverter extends XmlZip2TextConverter {
042
043    protected static final Log log = LogFactory.getLog(PPTX2TextConverter.class);
044
045    private static final String PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX = "ppt/slides/slide";
046
047    protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException,
048            SAXException {
049
050        Set<PresentationSlide> slides = new TreeSet<PresentationSlide>();
051
052        ZipEntry zipEntry = zis.getNextEntry();
053        while (zipEntry != null) {
054            String zipEntryName = zipEntry.getName();
055            if (zipEntryName.startsWith(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX)
056                    && zipEntryName.length() > PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length()) {
057                char slideNumberChar = zipEntryName.charAt(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length());
058                int slideNumber = -1;
059                try {
060                    slideNumber = Integer.parseInt(String.valueOf(slideNumberChar));
061                } catch (NumberFormatException nfe) {
062                    log.warn("Slide number is not an non integer, won't take this slide into account.");
063                }
064                if (slideNumber > -1) {
065                    OpenXmlContentHandler contentHandler = new OpenXmlContentHandler();
066                    reader.setContentHandler(contentHandler);
067                    reader.parse(new InputSource(new ByteArrayInputStream(IOUtils.toByteArray(zis))));
068                    slides.add(new PresentationSlide(contentHandler.getContent(), slideNumber));
069                }
070            }
071            zipEntry = zis.getNextEntry();
072        }
073        if (!slides.isEmpty()) {
074            Iterator<PresentationSlide> slidesIt = slides.iterator();
075            while (slidesIt.hasNext()) {
076                PresentationSlide slide = slidesIt.next();
077                sb.append(slide.getContent());
078                sb.append("\n");
079            }
080        }
081    }
082}