001/* 002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 */ 017package org.nuxeo.ecm.platform.importer.mqueues.pattern.producer; 018 019import org.apache.commons.logging.Log; 020import org.apache.commons.logging.LogFactory; 021import org.nuxeo.ecm.core.api.Blob; 022import org.nuxeo.ecm.core.api.Blobs; 023import org.nuxeo.ecm.platform.importer.mqueues.pattern.message.DocumentMessage; 024import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder; 025import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator; 026 027import java.io.Serializable; 028import java.nio.file.Path; 029import java.util.ArrayList; 030import java.util.Collections; 031import java.util.HashMap; 032import java.util.List; 033import java.util.Random; 034import java.util.concurrent.ThreadLocalRandom; 035 036/** 037 * @since 9.1 038 */ 039public class RandomDocumentMessageProducer extends AbstractProducer<DocumentMessage> { 040 private static final Log log = LogFactory.getLog(RandomDocumentMessageProducer.class); 041 private final long nbDocuments; 042 private final RandomBlobInfoProvider blobInfoProvider; 043 private boolean countFolderAsDocument = true; 044 private int maxFoldersPerFolder = 50; 045 private int maxDocumentsPerFolder = 500; 046 private int blobSizeKB = 0; 047 private boolean blobOnlyText = false; 048 049 private int documentCount = 0; 050 private int folderCount = 0; 051 private final Random rand; 052 private static RandomTextGenerator gen; 053 054 static protected final String[] DC_NATURE = {"article", "acknowledgement", "assessment", "application", "order", 055 "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure", 056 "report", "internshipReport", "pressReview"}; 057 058 static protected final String[] DC_SUBJECTS = {"art/architecture", "art/comics", "art/cinema", "art/culture", "art/danse", 059 "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math", 060 "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport", 061 "technology/it"}; 062 063 static protected final String[] DC_RIGHTS = {"OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL", 064 "FreeBSD", "CC0"}; 065 066 static protected final String[] DC_LANGUAGE = {"IT", "DE", "FR", "US", "EN"}; 067 068 static protected final String[] DC_SOURCE = {"internal", "external", "unknown"}; 069 070 static protected final String[] DC_COVERAGE = {"europe/France", "europe/Germany", "europe/Italy", "europe/Spain", 071 "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America"}; 072 private int foldersInCurrentFolderLimit; 073 private int documentInCurrentFolderLimit; 074 075 private enum DocType {Root, Folder, Document} 076 077 private DocType currentType = DocType.Root; 078 private int parentIndex = 0; 079 private List<String> parents = new ArrayList<>(); 080 private List<String> children = new ArrayList<>(); 081 private int documentInCurrentFolderCount = 0; 082 083 public RandomDocumentMessageProducer(int producerId, long nbDocuments, String lang, Path blobInfoDirectory) { 084 super(producerId); 085 this.nbDocuments = nbDocuments; 086 rand = ThreadLocalRandom.current(); 087 088 synchronized (RandomDocumentMessageProducer.class) { 089 if (gen == null) { 090 gen = new RandomTextGenerator(new HunspellDictionaryHolder(lang)); 091 gen.prefilCache(); 092 } 093 } 094 if (blobInfoDirectory != null) { 095 this.blobInfoProvider = new RandomBlobInfoProvider(blobInfoDirectory, producerId); 096 } else { 097 this.blobInfoProvider = null; 098 } 099 log.info("RandomDocumentMessageProducer created, nbDocuments: " + nbDocuments); 100 } 101 102 public RandomDocumentMessageProducer setMaxFoldersPerFolder(int max) { 103 maxFoldersPerFolder = max; 104 return this; 105 } 106 107 public RandomDocumentMessageProducer setMaxDocumentsPerFolder(int max) { 108 maxDocumentsPerFolder = max; 109 return this; 110 } 111 112 public RandomDocumentMessageProducer countFolderAsDocument(boolean value) { 113 countFolderAsDocument = value; 114 return this; 115 } 116 117 public RandomDocumentMessageProducer withBlob(int sizeKB, boolean onlyText) { 118 this.blobSizeKB = sizeKB; 119 this.blobOnlyText = onlyText; 120 return this; 121 } 122 123 124 @Override 125 public int getPartition(DocumentMessage message, int partitions) { 126 return getProducerId() % partitions; 127 } 128 129 @Override 130 public boolean hasNext() { 131 if (countFolderAsDocument) { 132 return (documentCount + folderCount) < nbDocuments; 133 } 134 return documentCount <= nbDocuments; 135 } 136 137 @Override 138 public DocumentMessage next() { 139 DocumentMessage ret; 140 switch (currentType) { 141 case Root: 142 ret = createRoot(); 143 parents.add(ret.getId()); 144 currentType = DocType.Folder; 145 foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1; 146 break; 147 case Folder: 148 ret = createFolder(parents.get(parentIndex)); 149 children.add(ret.getId()); 150 if (children.size() >= foldersInCurrentFolderLimit) { 151 currentType = DocType.Document; 152 documentInCurrentFolderCount = 0; 153 documentInCurrentFolderLimit = rand.nextInt(maxDocumentsPerFolder); 154 } 155 break; 156 default: 157 case Document: 158 ret = createDocument(parents.get(parentIndex), children); 159 documentInCurrentFolderCount += 1; 160 if (documentInCurrentFolderCount > documentInCurrentFolderLimit) { 161 parentIndex += 1; 162 if (parentIndex >= parents.size()) { 163 parents.clear(); 164 parents = children; 165 children = new ArrayList<>(); 166 parentIndex = 0; 167 } 168 currentType = DocType.Folder; 169 foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1; 170 } 171 break; 172 } 173 // log.debug(ret.getType() + ": " + ret.getId()); 174 return ret; 175 } 176 177 private DocumentMessage createRoot() { 178 folderCount++; 179 return getRandomNodeWithPrefix(String.format("%02d-", getProducerId()), "Folder", ""); 180 } 181 182 183 private DocumentMessage createFolder(String parentPath) { 184 DocumentMessage node = getRandomNode("Folder", parentPath, false); 185 folderCount++; 186 return node; 187 } 188 189 private DocumentMessage createDocument(String parentPath, List<String> exclude) { 190 DocumentMessage node = getRandomNode("File", parentPath, true); 191 String ret = node.getId(); 192 while (exclude.contains(ret)) { 193 log.debug("duplicate found"); 194 node = getRandomNode("File", parentPath, true); 195 ret = node.getId(); 196 } 197 documentCount++; 198 return node; 199 } 200 201 private DocumentMessage getRandomNode(String type, String parentPath, boolean withBlob) { 202 String title = getTitle(); 203 String name = getName(title); 204 HashMap<String, Serializable> props = getRandomProperties(title); 205 DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props); 206 if (withBlob) { 207 if (blobInfoProvider != null) { 208 builder.setBlobInfo(blobInfoProvider.getBlobInfo(builder)); 209 } else { 210 builder.setBlob(getRandomBlob()); 211 } 212 } 213 return builder.build(); 214 } 215 216 private DocumentMessage getRandomNodeWithPrefix(String prefix, String type, String parentPath) { 217 String title = getTitle(); 218 String name = prefix + getName(title); 219 HashMap<String, Serializable> props = getRandomProperties(title); 220 DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props); 221 if (blobInfoProvider != null) { 222 builder.setBlobInfo(blobInfoProvider.getBlobInfo(builder)); 223 } else { 224 builder.setBlob(getRandomBlob()); 225 } 226 return builder.build(); 227 } 228 229 private Blob getRandomBlob() { 230 if (blobSizeKB == 0) { 231 return null; 232 } 233 String content = gen.getRandomText(blobSizeKB); 234 return Blobs.createBlob(content, getBlobMimeType(), null, getName(getTitle()) + ".txt"); 235 } 236 237 private String getBlobMimeType() { 238 if (blobOnlyText) { 239 return "text/plain"; 240 } else { 241 return "text/partial"; 242 } 243 } 244 245 private String getName(String title) { 246 return title.replaceAll("\\W+", "-").toLowerCase(); 247 } 248 249 private String getTitle() { 250 return capitalize(gen.getRandomTitle(rand.nextInt(3) + 1).trim()); 251 // return "f" + folderCount; 252 } 253 254 private String capitalize(final String line) { 255 return Character.toUpperCase(line.charAt(0)) + line.substring(1); 256 } 257 258 protected HashMap<String, Serializable> getRandomProperties(String title) { 259 HashMap<String, Serializable> ret = new HashMap<>(); 260 ret.put("dc:title", title); 261 if (rand.nextInt(10) == 1) { 262 String description = gen.getRandomTitle(rand.nextInt(5) + 1); 263 ret.put("dc:description", capitalize(description)); 264 } 265 ret.put("dc:nature", getGaussian(DC_NATURE)); 266 ret.put("dc:subjects", (Serializable) Collections.singletonList(getGaussian(DC_SUBJECTS))); 267 ret.put("dc:rights", getGaussian(DC_RIGHTS)); 268 ret.put("dc:language", getGaussian(DC_LANGUAGE)); 269 ret.put("dc:coverage", getGaussian(DC_COVERAGE)); 270 ret.put("dc:source", getGaussian(DC_SOURCE)); 271 return ret; 272 } 273 274 protected String getGaussian(String[] words) { 275 double g = Math.abs(rand.nextGaussian() / 4); 276 g = Math.min(g, 1); 277 int i = (int) Math.floor(g * (words.length - 1)); 278 return words[i]; 279 } 280 281}