001/* 002 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 */ 017package org.nuxeo.importer.stream.producer; 018 019import java.io.Serializable; 020import java.util.ArrayList; 021import java.util.Collections; 022import java.util.HashMap; 023import java.util.HashSet; 024import java.util.List; 025import java.util.NoSuchElementException; 026import java.util.Random; 027import java.util.Set; 028import java.util.concurrent.ThreadLocalRandom; 029 030import org.apache.commons.logging.Log; 031import org.apache.commons.logging.LogFactory; 032import org.nuxeo.ecm.core.api.Blob; 033import org.nuxeo.ecm.core.api.Blobs; 034import org.nuxeo.ecm.core.blob.BlobInfo; 035import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder; 036import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator; 037import org.nuxeo.importer.stream.message.DocumentMessage; 038import org.nuxeo.lib.stream.pattern.producer.AbstractProducer; 039 040/** 041 * @since 9.1 042 */ 043public class RandomDocumentMessageProducer extends AbstractProducer<DocumentMessage> { 044 private static final Log log = LogFactory.getLog(RandomDocumentMessageProducer.class); 045 046 protected final long nbDocuments; 047 048 protected final BlobInfoFetcher blobInfoFetcher; 049 050 protected boolean countFolderAsDocument = true; 051 052 protected int maxFoldersPerFolder = 50; 053 054 protected int maxDocumentsPerFolder = 10000; 055 056 protected int blobSizeKB = 0; 057 058 protected boolean blobOnlyText = false; 059 060 protected int documentCount = 0; 061 062 protected int folderCount = 0; 063 064 protected final Random rand; 065 066 protected static RandomTextGenerator gen; 067 068 protected static final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order", 069 "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure", 070 "report", "internshipReport", "pressReview" }; 071 072 protected static final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture", 073 "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math", 074 "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport", 075 "technology/it" }; 076 077 protected static final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL", 078 "FreeBSD", "CC0" }; 079 080 protected static final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" }; 081 082 protected static final String[] DC_SOURCE = { "internal", "external", "unknown" }; 083 084 protected static final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain", 085 "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" }; 086 087 protected int foldersInCurrentFolderLimit; 088 089 protected int documentInCurrentFolderLimit; 090 091 protected enum DocType { 092 Root, Folder, Document 093 } 094 095 protected DocType currentType = DocType.Root; 096 097 protected int parentIndex = 0; 098 099 protected List<String> parents = new ArrayList<>(); 100 101 protected List<String> folderishChildren = new ArrayList<>(); 102 103 protected Set<String> children = new HashSet<>(); 104 105 protected int documentInCurrentFolderCount = 0; 106 107 public RandomDocumentMessageProducer(int producerId, long nbDocuments, String lang, 108 BlobInfoFetcher blobInfoFetcher) { 109 super(producerId); 110 this.nbDocuments = nbDocuments; 111 rand = ThreadLocalRandom.current(); 112 113 synchronized (RandomDocumentMessageProducer.class) { 114 if (gen == null) { 115 gen = new RandomTextGenerator(new HunspellDictionaryHolder(lang)); 116 gen.prefilCache(); 117 } 118 } 119 this.blobInfoFetcher = blobInfoFetcher; 120 log.info("RandomDocumentMessageProducer created, nbDocuments: " + nbDocuments); 121 } 122 123 public RandomDocumentMessageProducer setMaxFoldersPerFolder(int max) { 124 maxFoldersPerFolder = max; 125 return this; 126 } 127 128 public RandomDocumentMessageProducer setMaxDocumentsPerFolder(int max) { 129 maxDocumentsPerFolder = max; 130 return this; 131 } 132 133 public RandomDocumentMessageProducer countFolderAsDocument(boolean value) { 134 countFolderAsDocument = value; 135 return this; 136 } 137 138 public RandomDocumentMessageProducer withBlob(int sizeKB, boolean onlyText) { 139 this.blobSizeKB = sizeKB; 140 this.blobOnlyText = onlyText; 141 return this; 142 } 143 144 @Override 145 public int getPartition(DocumentMessage message, int partitions) { 146 return getProducerId() % partitions; 147 } 148 149 @Override 150 public boolean hasNext() { 151 if (countFolderAsDocument) { 152 return (documentCount + folderCount) < nbDocuments; 153 } 154 return documentCount < nbDocuments; 155 } 156 157 @Override 158 public DocumentMessage next() { 159 if (!hasNext()) { 160 throw new NoSuchElementException(); 161 } 162 DocumentMessage ret; 163 switch (currentType) { 164 case Root: 165 ret = createRoot(); 166 parents.add(ret.getId()); 167 currentType = DocType.Folder; 168 foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1; 169 break; 170 case Folder: 171 ret = createFolder(parents.get(parentIndex), children); 172 folderishChildren.add(ret.getId()); 173 children.add(ret.getName()); 174 if (folderishChildren.size() >= foldersInCurrentFolderLimit) { 175 currentType = DocType.Document; 176 documentInCurrentFolderCount = 0; 177 documentInCurrentFolderLimit = rand.nextInt(maxDocumentsPerFolder); 178 } 179 break; 180 default: 181 case Document: 182 ret = createDocument(parents.get(parentIndex), children); 183 children.add(ret.getName()); 184 documentInCurrentFolderCount += 1; 185 if (documentInCurrentFolderCount > documentInCurrentFolderLimit) { 186 parentIndex += 1; 187 if (parentIndex >= parents.size()) { 188 parents.clear(); 189 parents = folderishChildren; 190 folderishChildren = new ArrayList<>(); 191 children = new HashSet<>(); 192 parentIndex = 0; 193 } 194 currentType = DocType.Folder; 195 foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1; 196 } 197 break; 198 } 199 // log.debug(ret.getType() + ": " + ret.getId()); 200 return ret; 201 } 202 203 protected DocumentMessage createRoot() { 204 folderCount++; 205 return getRandomNodeWithPrefix(String.format("%02d-", getProducerId()), "Folder", ""); 206 } 207 208 protected DocumentMessage createFolder(String parentPath, Set<String> exclude) { 209 DocumentMessage node = getRandomNodeWithExclusion("Folder", parentPath, false, exclude); 210 folderCount++; 211 return node; 212 } 213 214 protected DocumentMessage createDocument(String parentPath, Set<String> exclude) { 215 DocumentMessage node = getRandomNodeWithExclusion("File", parentPath, true, exclude); 216 documentCount++; 217 return node; 218 } 219 220 protected DocumentMessage getRandomNodeWithExclusion(String type, String parentPath, boolean withBlob, 221 Set<String> exclude) { 222 DocumentMessage node = getRandomNode(type, parentPath, withBlob); 223 String name = node.getName(); 224 if (exclude.contains(name)) { 225 String newName = name + "-" + rand.nextInt(exclude.size()); 226 node = DocumentMessage.copy(node, newName); 227 } 228 return node; 229 } 230 231 protected DocumentMessage getRandomNode(String type, String parentPath, boolean withBlob) { 232 String title = getTitle(); 233 String name = getName(title); 234 HashMap<String, Serializable> props = getRandomProperties(title); 235 DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props); 236 if (withBlob) { 237 if (blobInfoFetcher != null) { 238 BlobInfo blobInfo = blobInfoFetcher.get(builder); 239 if (blobInfo != null) { 240 builder.setBlobInfo(blobInfo); 241 if (blobInfo.mimeType != null) { 242 builder.setType(getDocumentTypeForMimeType(blobInfo.mimeType)); 243 } 244 } 245 } else { 246 builder.setBlob(getRandomBlob()); 247 } 248 } 249 return builder.build(); 250 } 251 252 protected String getDocumentTypeForMimeType(String mimeType) { 253 if (mimeType.startsWith("image")) { 254 return "Picture"; 255 } 256 if (mimeType.startsWith("video")) { 257 return "Video"; 258 } 259 return "File"; 260 } 261 262 protected DocumentMessage getRandomNodeWithPrefix(String prefix, String type, String parentPath) { 263 String title = getTitle(); 264 String name = prefix + getName(title); 265 HashMap<String, Serializable> props = getRandomProperties(title); 266 DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props); 267 return builder.build(); 268 } 269 270 protected Blob getRandomBlob() { 271 if (blobSizeKB == 0) { 272 return null; 273 } 274 String content = gen.getRandomText(blobSizeKB); 275 return Blobs.createBlob(content, getBlobMimeType(), null, getName(getTitle()) + ".txt"); 276 } 277 278 protected String getBlobMimeType() { 279 if (blobOnlyText) { 280 return "text/plain"; 281 } else { 282 return "text/partial"; 283 } 284 } 285 286 protected String getName(String title) { 287 return title.replaceAll("\\W+", "-").toLowerCase(); 288 } 289 290 protected String getTitle() { 291 return capitalize(gen.getRandomTitle(rand.nextInt(3) + 1).trim()); 292 // return "f" + folderCount; 293 } 294 295 protected String capitalize(final String line) { 296 return Character.toUpperCase(line.charAt(0)) + line.substring(1); 297 } 298 299 protected HashMap<String, Serializable> getRandomProperties(String title) { 300 HashMap<String, Serializable> ret = new HashMap<>(); 301 ret.put("dc:title", title); 302 if (rand.nextInt(10) == 1) { 303 String description = gen.getRandomTitle(rand.nextInt(5) + 1); 304 ret.put("dc:description", capitalize(description)); 305 } 306 ret.put("dc:nature", getGaussian(DC_NATURE)); 307 ret.put("dc:subjects", (Serializable) Collections.singletonList(getGaussian(DC_SUBJECTS))); 308 ret.put("dc:rights", getGaussian(DC_RIGHTS)); 309 ret.put("dc:language", getGaussian(DC_LANGUAGE)); 310 ret.put("dc:coverage", getGaussian(DC_COVERAGE)); 311 ret.put("dc:source", getGaussian(DC_SOURCE)); 312 return ret; 313 } 314 315 protected String getGaussian(String[] words) { 316 double g = Math.abs(rand.nextGaussian() / 4); 317 g = Math.min(g, 1); 318 int i = (int) Math.floor(g * (words.length - 1)); 319 return words[i]; 320 } 321 322 @Override 323 public void close() throws Exception { 324 super.close(); 325 if (blobInfoFetcher != null) { 326 blobInfoFetcher.close(); 327 } 328 } 329 330}