001/*
002 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Nuxeo
018 *     Antoine Taillefer
019 *
020 */
021
022package org.nuxeo.ecm.core.convert.plugins.text.extractors;
023
024import java.io.ByteArrayInputStream;
025import java.io.IOException;
026import java.util.Iterator;
027import java.util.Set;
028import java.util.TreeSet;
029import java.util.zip.ZipEntry;
030import java.util.zip.ZipInputStream;
031
032import org.apache.commons.io.IOUtils;
033import org.apache.commons.logging.Log;
034import org.apache.commons.logging.LogFactory;
035import org.nuxeo.ecm.core.convert.plugins.text.extractors.presentation.PresentationSlide;
036import org.xml.sax.InputSource;
037import org.xml.sax.SAXException;
038import org.xml.sax.XMLReader;
039
040/**
041 * Pptx to text converter: parses the Open XML presentation document to read its content.
042 */
043public class PPTX2TextConverter extends XmlZip2TextConverter {
044
045    protected static final Log log = LogFactory.getLog(PPTX2TextConverter.class);
046
047    private static final String PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX = "ppt/slides/slide";
048
049    @Override
050    protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException,
051            SAXException {
052
053        Set<PresentationSlide> slides = new TreeSet<>();
054
055        ZipEntry zipEntry = zis.getNextEntry();
056        while (zipEntry != null) {
057            String zipEntryName = zipEntry.getName();
058            if (zipEntryName.startsWith(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX)
059                    && zipEntryName.length() > PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length()) {
060                char slideNumberChar = zipEntryName.charAt(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length());
061                int slideNumber = -1;
062                try {
063                    slideNumber = Integer.parseInt(String.valueOf(slideNumberChar));
064                } catch (NumberFormatException nfe) {
065                    log.warn("Slide number is not an non integer, won't take this slide into account.");
066                }
067                if (slideNumber > -1) {
068                    OpenXmlContentHandler contentHandler = new OpenXmlContentHandler();
069                    reader.setContentHandler(contentHandler);
070                    reader.parse(new InputSource(new ByteArrayInputStream(IOUtils.toByteArray(zis))));
071                    slides.add(new PresentationSlide(contentHandler.getContent(), slideNumber));
072                }
073            }
074            zipEntry = zis.getNextEntry();
075        }
076        if (!slides.isEmpty()) {
077            Iterator<PresentationSlide> slidesIt = slides.iterator();
078            while (slidesIt.hasNext()) {
079                PresentationSlide slide = slidesIt.next();
080                sb.append(slide.getContent());
081                sb.append("\n");
082            }
083        }
084    }
085}