001/*
002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 */
017package org.nuxeo.importer.stream.producer;
018
019import java.io.Serializable;
020import java.util.ArrayList;
021import java.util.Collections;
022import java.util.HashMap;
023import java.util.HashSet;
024import java.util.List;
025import java.util.Random;
026import java.util.Set;
027import java.util.concurrent.ThreadLocalRandom;
028
029import org.apache.commons.logging.Log;
030import org.apache.commons.logging.LogFactory;
031import org.nuxeo.ecm.core.api.Blob;
032import org.nuxeo.ecm.core.api.Blobs;
033import org.nuxeo.ecm.core.blob.BlobInfo;
034import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder;
035import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator;
036import org.nuxeo.importer.stream.message.DocumentMessage;
037import org.nuxeo.lib.stream.pattern.producer.AbstractProducer;
038
039/**
040 * @since 9.1
041 */
042public class RandomDocumentMessageProducer extends AbstractProducer<DocumentMessage> {
043    private static final Log log = LogFactory.getLog(RandomDocumentMessageProducer.class);
044
045    protected final long nbDocuments;
046
047    protected final BlobInfoFetcher blobInfoFetcher;
048
049    protected boolean countFolderAsDocument = true;
050
051    protected int maxFoldersPerFolder = 50;
052
053    protected int maxDocumentsPerFolder = 10000;
054
055    protected int blobSizeKB = 0;
056
057    protected boolean blobOnlyText = false;
058
059    protected int documentCount = 0;
060
061    protected int folderCount = 0;
062
063    protected final Random rand;
064
065    protected static RandomTextGenerator gen;
066
067    protected static final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order",
068            "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure",
069            "report", "internshipReport", "pressReview" };
070
071    protected static final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture",
072            "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math",
073            "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport",
074            "technology/it" };
075
076    protected static final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL",
077            "FreeBSD", "CC0" };
078
079    protected static final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" };
080
081    protected static final String[] DC_SOURCE = { "internal", "external", "unknown" };
082
083    protected static final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain",
084            "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" };
085
086    protected int foldersInCurrentFolderLimit;
087
088    protected int documentInCurrentFolderLimit;
089
090    protected enum DocType {
091        Root, Folder, Document
092    }
093
094    protected DocType currentType = DocType.Root;
095
096    protected int parentIndex = 0;
097
098    protected List<String> parents = new ArrayList<>();
099
100    protected List<String> folderishChildren = new ArrayList<>();
101
102    protected Set<String> children = new HashSet<>();
103
104    protected int documentInCurrentFolderCount = 0;
105
106    public RandomDocumentMessageProducer(int producerId, long nbDocuments, String lang,
107            BlobInfoFetcher blobInfoFetcher) {
108        super(producerId);
109        this.nbDocuments = nbDocuments;
110        rand = ThreadLocalRandom.current();
111
112        synchronized (RandomDocumentMessageProducer.class) {
113            if (gen == null) {
114                gen = new RandomTextGenerator(new HunspellDictionaryHolder(lang));
115                gen.prefilCache();
116            }
117        }
118        this.blobInfoFetcher = blobInfoFetcher;
119        log.info("RandomDocumentMessageProducer created, nbDocuments: " + nbDocuments);
120    }
121
122    public RandomDocumentMessageProducer setMaxFoldersPerFolder(int max) {
123        maxFoldersPerFolder = max;
124        return this;
125    }
126
127    public RandomDocumentMessageProducer setMaxDocumentsPerFolder(int max) {
128        maxDocumentsPerFolder = max;
129        return this;
130    }
131
132    public RandomDocumentMessageProducer countFolderAsDocument(boolean value) {
133        countFolderAsDocument = value;
134        return this;
135    }
136
137    public RandomDocumentMessageProducer withBlob(int sizeKB, boolean onlyText) {
138        this.blobSizeKB = sizeKB;
139        this.blobOnlyText = onlyText;
140        return this;
141    }
142
143    @Override
144    public int getPartition(DocumentMessage message, int partitions) {
145        return getProducerId() % partitions;
146    }
147
148    @Override
149    public boolean hasNext() {
150        if (countFolderAsDocument) {
151            return (documentCount + folderCount) < nbDocuments;
152        }
153        return documentCount < nbDocuments;
154    }
155
156    @Override
157    public DocumentMessage next() {
158        DocumentMessage ret;
159        switch (currentType) {
160        case Root:
161            ret = createRoot();
162            parents.add(ret.getId());
163            currentType = DocType.Folder;
164            foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
165            break;
166        case Folder:
167            ret = createFolder(parents.get(parentIndex), children);
168            folderishChildren.add(ret.getId());
169            children.add(ret.getName());
170            if (folderishChildren.size() >= foldersInCurrentFolderLimit) {
171                currentType = DocType.Document;
172                documentInCurrentFolderCount = 0;
173                documentInCurrentFolderLimit = rand.nextInt(maxDocumentsPerFolder);
174            }
175            break;
176        default:
177        case Document:
178            ret = createDocument(parents.get(parentIndex), children);
179            children.add(ret.getName());
180            documentInCurrentFolderCount += 1;
181            if (documentInCurrentFolderCount > documentInCurrentFolderLimit) {
182                parentIndex += 1;
183                if (parentIndex >= parents.size()) {
184                    parents.clear();
185                    parents = folderishChildren;
186                    folderishChildren = new ArrayList<>();
187                    children = new HashSet<>();
188                    parentIndex = 0;
189                }
190                currentType = DocType.Folder;
191                foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
192            }
193            break;
194        }
195        // log.debug(ret.getType() + ": " + ret.getId());
196        return ret;
197    }
198
199    protected DocumentMessage createRoot() {
200        folderCount++;
201        return getRandomNodeWithPrefix(String.format("%02d-", getProducerId()), "Folder", "");
202    }
203
204    protected DocumentMessage createFolder(String parentPath, Set<String> exclude) {
205        DocumentMessage node = getRandomNodeWithExclusion("Folder", parentPath, false, exclude);
206        folderCount++;
207        return node;
208    }
209
210    protected DocumentMessage createDocument(String parentPath, Set<String> exclude) {
211        DocumentMessage node = getRandomNodeWithExclusion("File", parentPath, true, exclude);
212        documentCount++;
213        return node;
214    }
215
216    protected DocumentMessage getRandomNodeWithExclusion(String type, String parentPath, boolean withBlob,
217            Set<String> exclude) {
218        DocumentMessage node = getRandomNode(type, parentPath, withBlob);
219        String name = node.getName();
220        if (exclude.contains(name)) {
221            String newName = name + "-" + rand.nextInt(exclude.size());
222            node = DocumentMessage.copy(node, newName);
223        }
224        return node;
225    }
226
227    protected DocumentMessage getRandomNode(String type, String parentPath, boolean withBlob) {
228        String title = getTitle();
229        String name = getName(title);
230        HashMap<String, Serializable> props = getRandomProperties(title);
231        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
232        if (withBlob) {
233            if (blobInfoFetcher != null) {
234                BlobInfo blobInfo = blobInfoFetcher.get(builder);
235                if (blobInfo != null) {
236                    builder.setBlobInfo(blobInfo);
237                    if (blobInfo.mimeType != null) {
238                        builder.setType(getDocumentTypeForMimeType(blobInfo.mimeType));
239                    }
240                }
241            } else {
242                builder.setBlob(getRandomBlob());
243            }
244        }
245        return builder.build();
246    }
247
248    protected String getDocumentTypeForMimeType(String mimeType) {
249        if (mimeType.startsWith("image")) {
250            return "Picture";
251        }
252        if (mimeType.startsWith("video")) {
253            return "Video";
254        }
255        return "File";
256    }
257
258    protected DocumentMessage getRandomNodeWithPrefix(String prefix, String type, String parentPath) {
259        String title = getTitle();
260        String name = prefix + getName(title);
261        HashMap<String, Serializable> props = getRandomProperties(title);
262        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
263        return builder.build();
264    }
265
266    protected Blob getRandomBlob() {
267        if (blobSizeKB == 0) {
268            return null;
269        }
270        String content = gen.getRandomText(blobSizeKB);
271        return Blobs.createBlob(content, getBlobMimeType(), null, getName(getTitle()) + ".txt");
272    }
273
274    protected String getBlobMimeType() {
275        if (blobOnlyText) {
276            return "text/plain";
277        } else {
278            return "text/partial";
279        }
280    }
281
282    protected String getName(String title) {
283        return title.replaceAll("\\W+", "-").toLowerCase();
284    }
285
286    protected String getTitle() {
287        return capitalize(gen.getRandomTitle(rand.nextInt(3) + 1).trim());
288        // return "f" + folderCount;
289    }
290
291    protected String capitalize(final String line) {
292        return Character.toUpperCase(line.charAt(0)) + line.substring(1);
293    }
294
295    protected HashMap<String, Serializable> getRandomProperties(String title) {
296        HashMap<String, Serializable> ret = new HashMap<>();
297        ret.put("dc:title", title);
298        if (rand.nextInt(10) == 1) {
299            String description = gen.getRandomTitle(rand.nextInt(5) + 1);
300            ret.put("dc:description", capitalize(description));
301        }
302        ret.put("dc:nature", getGaussian(DC_NATURE));
303        ret.put("dc:subjects", (Serializable) Collections.singletonList(getGaussian(DC_SUBJECTS)));
304        ret.put("dc:rights", getGaussian(DC_RIGHTS));
305        ret.put("dc:language", getGaussian(DC_LANGUAGE));
306        ret.put("dc:coverage", getGaussian(DC_COVERAGE));
307        ret.put("dc:source", getGaussian(DC_SOURCE));
308        return ret;
309    }
310
311    protected String getGaussian(String[] words) {
312        double g = Math.abs(rand.nextGaussian() / 4);
313        g = Math.min(g, 1);
314        int i = (int) Math.floor(g * (words.length - 1));
315        return words[i];
316    }
317
318    @Override
319    public void close() throws Exception {
320        super.close();
321        if (blobInfoFetcher != null) {
322            blobInfoFetcher.close();
323        }
324    }
325
326}