001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 * Contributors:
017 *     Nuxeo
018 */
019
020package org.nuxeo.apidoc.worker;
021
022import java.io.IOException;
023import java.util.HashSet;
024import java.util.Set;
025import java.util.stream.IntStream;
026
027import javax.xml.parsers.ParserConfigurationException;
028import javax.xml.parsers.SAXParser;
029import javax.xml.parsers.SAXParserFactory;
030
031import org.apache.commons.lang3.StringUtils;
032import org.nuxeo.apidoc.listener.AttributesExtractorStater;
033import org.nuxeo.ecm.core.api.Blob;
034import org.nuxeo.ecm.core.api.DocumentModel;
035import org.nuxeo.ecm.core.api.DocumentNotFoundException;
036import org.nuxeo.ecm.core.api.DocumentRef;
037import org.nuxeo.ecm.core.api.NuxeoException;
038import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
039import org.nuxeo.ecm.core.work.AbstractWork;
040import org.nuxeo.ecm.platform.dublincore.listener.DublinCoreListener;
041import org.xml.sax.Attributes;
042import org.xml.sax.SAXException;
043import org.xml.sax.helpers.DefaultHandler;
044
045/**
046 * @author <a href="mailto:ak@nuxeo.com">Arnaud Kervern</a>
047 * @since 8.3
048 */
049public class ExtractXmlAttributesWorker extends AbstractWork {
050
051    private static final long serialVersionUID = 1L;
052
053    public static final String CATEGORY = "apidoc-xml-extractor";
054
055    protected ExtractXmlAttributesWorker(String repositoryName, String docId) {
056        super(String.format("%s:%s:xml:extractor", repositoryName, docId));
057        setDocument(repositoryName, docId);
058    }
059
060    public ExtractXmlAttributesWorker(String repositoryName, String originatingUsername, String docId) {
061        this(repositoryName, docId);
062        setOriginatingUsername(originatingUsername);
063    }
064
065    @Override
066    public void work() {
067        setStatus("Extracting");
068        openSystemSession();
069
070        try {
071            DocumentModel doc = loadDocument();
072            BlobHolder adapter = doc.getAdapter(BlobHolder.class);
073            String attributes = extractAttributes(adapter.getBlob());
074            doc.setPropertyValue(AttributesExtractorStater.ATTRIBUTES_PROPERTY, attributes);
075
076            session.saveDocument(doc);
077
078            setStatus("Done");
079        } catch (DocumentNotFoundException cause) {
080            ;
081        } catch (IOException | ParserConfigurationException | SAXException e) {
082            setStatus("Failed");
083            throw new NuxeoException(e);
084        }
085    }
086
087    protected DocumentModel loadDocument() throws DocumentNotFoundException {
088        final DocumentRef docRef = getDocument().getDocRef();
089        DocumentModel doc = session.getDocument(docRef);
090        doc.putContextData(DublinCoreListener.DISABLE_DUBLINCORE_LISTENER, true);
091        return doc;
092    }
093
094    public String extractAttributes(Blob blob) throws ParserConfigurationException, SAXException, IOException {
095        if (blob == null) {
096            return null;
097        }
098
099        SAXParserFactory factory = SAXParserFactory.newInstance();
100        SAXParser saxParser = factory.newSAXParser();
101
102        Set<String> attributes = new HashSet<>();
103        saxParser.parse(blob.getStream(), new Handler(attributes));
104
105        return StringUtils.join(attributes, ' ');
106    }
107
108    @Override
109    public String getCategory() {
110        return CATEGORY;
111    }
112
113    @Override
114    public String getTitle() {
115        return "XML Attributes extractor for fulltext search";
116    }
117
118    protected static class Handler extends DefaultHandler {
119        private Set<String> attributesSet;
120
121        public Handler(Set<String> attributesSet) {
122            this.attributesSet = attributesSet;
123        }
124
125        @Override
126        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
127            super.startElement(uri, localName, qName, attributes);
128            IntStream.range(0, attributes.getLength()).forEach(i -> attributesSet.add(attributes.getValue(i)));
129        }
130    }
131
132}