/*
 * (C) Copyright 2006-2012 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *     Nuxeo
 *     Antoine Taillefer
 *
 */

package org.nuxeo.ecm.core.convert.plugins.text.extractors;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.convert.plugins.text.extractors.presentation.PresentationSlide;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * Pptx to text converter: parses the Open XML presentation document to read its content.
 */
public class PPTX2TextConverter extends XmlZip2TextConverter {

    protected static final Log log = LogFactory.getLog(PPTX2TextConverter.class);

    /** Common prefix of the zip entries holding slides; the slide number immediately follows it. */
    private static final String PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX = "ppt/slides/slide";

    /**
     * Reads every slide entry of the zipped presentation, extracts its text with an
     * {@link OpenXmlContentHandler} and appends the slide contents to {@code sb}, one slide per line, in the
     * iteration order of a sorted set of {@link PresentationSlide}.
     * <p>
     * Entries whose name does not start with {@value #PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX} followed by a
     * number are ignored (a warning is logged when the prefix matches but no number follows).
     *
     * @param zis the zip stream of the presentation, positioned before its first entry
     * @param reader the XML reader used to parse each slide; its content handler is replaced for every slide
     * @param sb the builder receiving the extracted text
     * @throws IOException if reading the zip stream fails
     * @throws SAXException if a slide cannot be parsed
     */
    @Override
    protected void readXmlZipContent(ZipInputStream zis, XMLReader reader, StringBuilder sb) throws IOException,
            SAXException {

        Set<PresentationSlide> slides = new TreeSet<>();

        for (ZipEntry zipEntry = zis.getNextEntry(); zipEntry != null; zipEntry = zis.getNextEntry()) {
            String zipEntryName = zipEntry.getName();
            if (!zipEntryName.startsWith(PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX)
                    || zipEntryName.length() <= PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length()) {
                continue;
            }
            int slideNumber = parseSlideNumber(zipEntryName);
            if (slideNumber < 0) {
                log.warn("Slide number is not an integer in zip entry '" + zipEntryName
                        + "', won't take this slide into account.");
                continue;
            }
            OpenXmlContentHandler contentHandler = new OpenXmlContentHandler();
            reader.setContentHandler(contentHandler);
            // The entry is fully buffered before parsing, presumably so that the parser works on its own
            // stream and cannot close or read past the current entry of the shared zip stream.
            reader.parse(new InputSource(new ByteArrayInputStream(IOUtils.toByteArray(zis))));
            slides.add(new PresentationSlide(contentHandler.getContent(), slideNumber));
        }

        for (PresentationSlide slide : slides) {
            sb.append(slide.getContent()).append("\n");
        }
    }

    /**
     * Extracts the slide number from a slide zip entry name such as {@code ppt/slides/slide12.xml}.
     * <p>
     * The whole run of digits following the prefix is used, so that slides numbered 10 and above are not
     * mistaken for slides 1 to 9.
     *
     * @param zipEntryName an entry name starting with {@value #PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX}
     * @return the slide number, or {@code -1} if no valid integer follows the prefix
     */
    private static int parseSlideNumber(String zipEntryName) {
        int start = PRESENTATION_SLIDE_ZIP_ENTRY_NAME_PREFIX.length();
        int end = start;
        while (end < zipEntryName.length() && Character.isDigit(zipEntryName.charAt(end))) {
            end++;
        }
        if (end == start) {
            return -1;
        }
        try {
            return Integer.parseInt(zipEntryName.substring(start, end));
        } catch (NumberFormatException nfe) {
            // Digit run too long to fit in an int: treat as an invalid slide number.
            return -1;
        }
    }
}