001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 */
017package org.nuxeo.importer.stream.producer;
018
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import java.util.regex.Pattern;
029
030import org.apache.commons.logging.Log;
031import org.apache.commons.logging.LogFactory;
032import org.nuxeo.ecm.core.api.Blob;
033import org.nuxeo.ecm.core.api.Blobs;
034import org.nuxeo.ecm.core.blob.BlobInfo;
035import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder;
036import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator;
037import org.nuxeo.importer.stream.message.DocumentMessage;
038import org.nuxeo.lib.stream.pattern.producer.AbstractProducer;
039
040/**
041 * @since 9.1
042 */
043public class RandomDocumentMessageProducer extends AbstractProducer<DocumentMessage> {
044    private static final Log log = LogFactory.getLog(RandomDocumentMessageProducer.class);
045
046    protected final long nbDocuments;
047
048    protected final BlobInfoFetcher blobInfoFetcher;
049
050    protected boolean countFolderAsDocument = true;
051
052    protected int maxFoldersPerFolder = 50;
053
054    protected int maxDocumentsPerFolder = 10000;
055
056    protected int blobSizeKB = 0;
057
058    protected boolean blobOnlyText = false;
059
060    protected int documentCount = 0;
061
062    protected int folderCount = 0;
063
064    protected final Random rand;
065
066    protected static RandomTextGenerator gen;
067
068    protected static final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order",
069            "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure",
070            "report", "internshipReport", "pressReview" };
071
072    protected static final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture",
073            "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math",
074            "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport",
075            "technology/it" };
076
077    protected static final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL",
078            "FreeBSD", "CC0" };
079
080    protected static final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" };
081
082    protected static final String[] DC_SOURCE = { "internal", "external", "unknown" };
083
084    protected static final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain",
085            "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" };
086
087    protected int foldersInCurrentFolderLimit;
088
089    protected int documentInCurrentFolderLimit;
090
091    protected enum DocType {
092        Root, Folder, Document
093    }
094
095    protected DocType currentType = DocType.Root;
096
097    protected int parentIndex = 0;
098
099    protected List<String> parents = new ArrayList<>();
100
101    protected List<String> folderishChildren = new ArrayList<>();
102
103    protected Set<String> children = new HashSet<>();
104
105    protected int documentInCurrentFolderCount = 0;
106
107    public RandomDocumentMessageProducer(int producerId, long nbDocuments, String lang,
108            BlobInfoFetcher blobInfoFetcher) {
109        super(producerId);
110        this.nbDocuments = nbDocuments;
111        rand = ThreadLocalRandom.current();
112
113        synchronized (RandomDocumentMessageProducer.class) {
114            if (gen == null) {
115                gen = new RandomTextGenerator(new HunspellDictionaryHolder(lang));
116                gen.prefilCache();
117            }
118        }
119        this.blobInfoFetcher = blobInfoFetcher;
120        log.info("RandomDocumentMessageProducer created, nbDocuments: " + nbDocuments);
121    }
122
123    public RandomDocumentMessageProducer setMaxFoldersPerFolder(int max) {
124        maxFoldersPerFolder = max;
125        return this;
126    }
127
128    public RandomDocumentMessageProducer setMaxDocumentsPerFolder(int max) {
129        maxDocumentsPerFolder = max;
130        return this;
131    }
132
133    public RandomDocumentMessageProducer countFolderAsDocument(boolean value) {
134        countFolderAsDocument = value;
135        return this;
136    }
137
138    public RandomDocumentMessageProducer withBlob(int sizeKB, boolean onlyText) {
139        this.blobSizeKB = sizeKB;
140        this.blobOnlyText = onlyText;
141        return this;
142    }
143
144    @Override
145    public int getPartition(DocumentMessage message, int partitions) {
146        return getProducerId() % partitions;
147    }
148
149    @Override
150    public boolean hasNext() {
151        if (countFolderAsDocument) {
152            return (documentCount + folderCount) < nbDocuments;
153        }
154        return documentCount < nbDocuments;
155    }
156
157    @Override
158    public DocumentMessage next() {
159        if (!hasNext()) {
160            throw new NoSuchElementException();
161        }
162        DocumentMessage ret;
163        switch (currentType) {
164        case Root:
165            ret = createRoot();
166            parents.add(ret.getId());
167            currentType = DocType.Folder;
168            foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
169            break;
170        case Folder:
171            ret = createFolder(parents.get(parentIndex), children);
172            folderishChildren.add(ret.getId());
173            children.add(ret.getName());
174            if (folderishChildren.size() >= foldersInCurrentFolderLimit) {
175                currentType = DocType.Document;
176                documentInCurrentFolderCount = 0;
177                documentInCurrentFolderLimit = rand.nextInt(maxDocumentsPerFolder);
178            }
179            break;
180        default:
181        case Document:
182            ret = createDocument(parents.get(parentIndex), children);
183            children.add(ret.getName());
184            documentInCurrentFolderCount += 1;
185            if (documentInCurrentFolderCount > documentInCurrentFolderLimit) {
186                parentIndex += 1;
187                if (parentIndex >= parents.size()) {
188                    parents.clear();
189                    parents = folderishChildren;
190                    folderishChildren = new ArrayList<>();
191                    children = new HashSet<>();
192                    parentIndex = 0;
193                }
194                currentType = DocType.Folder;
195                foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
196            }
197            break;
198        }
199        // log.debug(ret.getType() + ": " + ret.getId());
200        return ret;
201    }
202
203    protected DocumentMessage createRoot() {
204        folderCount++;
205        return getRandomNodeWithPrefix(String.format("%02d-", getProducerId()), "Folder", "");
206    }
207
208    protected DocumentMessage createFolder(String parentPath, Set<String> exclude) {
209        DocumentMessage node = getRandomNodeWithExclusion("Folder", parentPath, false, exclude);
210        folderCount++;
211        return node;
212    }
213
214    protected DocumentMessage createDocument(String parentPath, Set<String> exclude) {
215        DocumentMessage node = getRandomNodeWithExclusion("File", parentPath, true, exclude);
216        documentCount++;
217        return node;
218    }
219
220    protected DocumentMessage getRandomNodeWithExclusion(String type, String parentPath, boolean withBlob,
221            Set<String> exclude) {
222        DocumentMessage node = getRandomNode(type, parentPath, withBlob);
223        String name = node.getName();
224        if (exclude.contains(name)) {
225            String newName = name + "-" + rand.nextInt(exclude.size());
226            node = DocumentMessage.copy(node, newName);
227        }
228        return node;
229    }
230
231    protected DocumentMessage getRandomNode(String type, String parentPath, boolean withBlob) {
232        String title = getTitle();
233        String name = getName(title);
234        HashMap<String, Serializable> props = getRandomProperties(title);
235        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
236        if (withBlob) {
237            if (blobInfoFetcher != null) {
238                BlobInfo blobInfo = blobInfoFetcher.get(builder);
239                if (blobInfo != null) {
240                    builder.setBlobInfo(blobInfo);
241                    if (blobInfo.mimeType != null) {
242                        builder.setType(getDocumentTypeForMimeType(blobInfo.mimeType));
243                    }
244                }
245            } else {
246                builder.setBlob(getRandomBlob());
247            }
248        }
249        return builder.build();
250    }
251
252    protected String getDocumentTypeForMimeType(String mimeType) {
253        if (mimeType.startsWith("image")) {
254            return "Picture";
255        }
256        if (mimeType.startsWith("video")) {
257            return "Video";
258        }
259        return "File";
260    }
261
262    protected DocumentMessage getRandomNodeWithPrefix(String prefix, String type, String parentPath) {
263        String title = getTitle();
264        String name = prefix + getName(title);
265        HashMap<String, Serializable> props = getRandomProperties(title);
266        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
267        return builder.build();
268    }
269
270    protected Blob getRandomBlob() {
271        if (blobSizeKB == 0) {
272            return null;
273        }
274        String content = gen.getRandomText(blobSizeKB);
275        return Blobs.createBlob(content, getBlobMimeType(), null, getName(getTitle()) + ".txt");
276    }
277
278    protected String getBlobMimeType() {
279        if (blobOnlyText) {
280            return "text/plain";
281        } else {
282            return "text/partial";
283        }
284    }
285
286    protected String getName(String title) {
287        return title.replaceAll("\\W+", "-").toLowerCase();
288    }
289
290    protected String getTitle() {
291        return capitalize(gen.getRandomTitle(rand.nextInt(3) + 1).trim());
292        // return "f" + folderCount;
293    }
294
295    protected String capitalize(final String line) {
296        return Character.toUpperCase(line.charAt(0)) + line.substring(1);
297    }
298
299    protected HashMap<String, Serializable> getRandomProperties(String title) {
300        HashMap<String, Serializable> ret = new HashMap<>();
301        ret.put("dc:title", title);
302        if (rand.nextInt(10) == 1) {
303            String description = gen.getRandomTitle(rand.nextInt(5) + 1);
304            ret.put("dc:description", capitalize(description));
305        }
306        ret.put("dc:nature", getGaussian(DC_NATURE));
307        ret.put("dc:subjects", (Serializable) Collections.singletonList(getGaussian(DC_SUBJECTS)));
308        ret.put("dc:rights", getGaussian(DC_RIGHTS));
309        ret.put("dc:language", getGaussian(DC_LANGUAGE));
310        ret.put("dc:coverage", getGaussian(DC_COVERAGE));
311        ret.put("dc:source", getGaussian(DC_SOURCE));
312        return ret;
313    }
314
315    protected String getGaussian(String[] words) {
316        double g = Math.abs(rand.nextGaussian() / 4);
317        g = Math.min(g, 1);
318        int i = (int) Math.floor(g * (words.length - 1));
319        return words[i];
320    }
321
322    @Override
323    public void close() throws Exception {
324        super.close();
325        if (blobInfoFetcher != null) {
326            blobInfoFetcher.close();
327        }
328    }
329
330}