/*
 * (C) Copyright 2016 Nuxeo SA (http://nuxeo.com/) and others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package org.nuxeo.importer.stream.producer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.Blobs;
import org.nuxeo.ecm.platform.importer.random.HunspellDictionaryHolder;
import org.nuxeo.ecm.platform.importer.random.RandomTextGenerator;
import org.nuxeo.importer.stream.message.DocumentMessage;
import org.nuxeo.lib.stream.pattern.producer.AbstractProducer;

/**
 * Producer of randomly generated {@link DocumentMessage}s organized as a tree.
 * <p>
 * The first message is a root folder whose name is prefixed with the producer id. Then, for the current parent, a
 * random number of "Folder" messages is emitted followed by a random number of "File" messages. Once the document
 * quota of a parent is exceeded the producer moves to the next parent of the current level; when the level is
 * exhausted the folders created so far become the parents of the next level.
 *
 * @since 9.1
 */
public class RandomDocumentMessageProducer extends AbstractProducer<DocumentMessage> {
    private static final Log log = LogFactory.getLog(RandomDocumentMessageProducer.class);

    /** Number of messages to produce, see {@link #hasNext()} for what is counted. */
    protected final long nbDocuments;

    /** Optional provider of blob information; when {@code null} a random blob is generated instead. */
    protected final BlobInfoFetcher blobInfoFetcher;

    /** When {@code true} folders count toward {@link #nbDocuments}. */
    protected boolean countFolderAsDocument = true;

    protected int maxFoldersPerFolder = 50;

    protected int maxDocumentsPerFolder = 500;

    /** Size argument passed to the text generator for random blobs; {@code 0} disables random blobs. */
    protected int blobSizeKB = 0;

    protected boolean blobOnlyText = false;

    protected int documentCount = 0;

    protected int folderCount = 0;

    protected final Random rand;

    /** Text generator shared by all producers, created by the first constructed instance. */
    protected static RandomTextGenerator gen;

    protected static final String[] DC_NATURE = { "article", "acknowledgement", "assessment", "application", "order",
            "contract", "quotation", "fax", "worksheet", "letter", "memo", "note", "notification", "procedure",
            "report", "internshipReport", "pressReview" };

    protected static final String[] DC_SUBJECTS = { "art/architecture", "art/comics", "art/cinema", "art/culture",
            "art/danse", "art/music", "sciences/astronomy", "sciences/biology", "sciences/chemistry", "sciences/math",
            "sciences/physic", "society/ecology", "daily life/gastronomy", "daily life/gardening", "daily life/sport",
            "technology/it" };

    protected static final String[] DC_RIGHTS = { "OpenContentL", "CC-BY-NC", "CC-BY-ND", "FreeArt", "ODbi", "GNUGPL",
            "FreeBSD", "CC0" };

    protected static final String[] DC_LANGUAGE = { "IT", "DE", "FR", "US", "EN" };

    protected static final String[] DC_SOURCE = { "internal", "external", "unknown" };

    protected static final String[] DC_COVERAGE = { "europe/France", "europe/Germany", "europe/Italy", "europe/Spain",
            "oceania/Tonga", "africa/Mali", "asia/Japan", "north-america/United_States_of_America" };

    /** Number of folderish children after which the producer switches to emitting documents. */
    protected int foldersInCurrentFolderLimit;

    /** Document count that must be exceeded before moving to the next parent. */
    protected int documentInCurrentFolderLimit;

    protected enum DocType {
        Root, Folder, Document
    }

    /** Kind of message the next call to {@link #next()} will create. */
    protected DocType currentType = DocType.Root;

    /** Index in {@link #parents} of the folder currently being filled. */
    protected int parentIndex = 0;

    /** Ids of the folders of the current level. */
    protected List<String> parents = new ArrayList<>();

    /** Ids of the folders created under the current level; they become the parents of the next level. */
    protected List<String> folderishChildren = new ArrayList<>();

    /** Names already used while filling the current level, used to avoid duplicates. */
    protected Set<String> children = new HashSet<>();

    protected int documentInCurrentFolderCount = 0;

    /**
     * Creates a producer.
     *
     * @param producerId the producer identifier, also used to prefix the root folder name and to choose the partition
     * @param nbDocuments the number of messages to produce
     * @param lang the dictionary language; only taken into account by the first instance created, because the text
     *            generator is shared
     * @param blobInfoFetcher optional blob information provider, may be {@code null}
     */
    public RandomDocumentMessageProducer(int producerId, long nbDocuments, String lang,
            BlobInfoFetcher blobInfoFetcher) {
        super(producerId);
        this.nbDocuments = nbDocuments;
        // A ThreadLocalRandom must not be kept in a field: its state belongs to the thread that calls it, so an
        // instance obtained here is not valid if the producer is later driven by another thread. A dedicated Random
        // is safe whichever thread ends up calling next().
        rand = new Random();

        // the dictionary is expensive to load, share a single generator between all the producers
        synchronized (RandomDocumentMessageProducer.class) {
            if (gen == null) {
                gen = new RandomTextGenerator(new HunspellDictionaryHolder(lang));
                gen.prefilCache();
            }
        }
        this.blobInfoFetcher = blobInfoFetcher;
        log.info("RandomDocumentMessageProducer created, nbDocuments: " + nbDocuments);
    }

    /**
     * Sets the upper bound of the random number of folders created per folder.
     *
     * @param max must be strictly positive, otherwise {@link #next()} fails when drawing the random limit
     * @return this producer
     */
    public RandomDocumentMessageProducer setMaxFoldersPerFolder(int max) {
        maxFoldersPerFolder = max;
        return this;
    }

    /**
     * Sets the upper bound of the random number of documents created per folder.
     *
     * @param max must be strictly positive, otherwise {@link #next()} fails when drawing the random limit
     * @return this producer
     */
    public RandomDocumentMessageProducer setMaxDocumentsPerFolder(int max) {
        maxDocumentsPerFolder = max;
        return this;
    }

    /**
     * Chooses whether folders count toward the number of documents to produce.
     *
     * @return this producer
     */
    public RandomDocumentMessageProducer countFolderAsDocument(boolean value) {
        countFolderAsDocument = value;
        return this;
    }

    /**
     * Configures the random blobs, used only when no {@link BlobInfoFetcher} is set.
     *
     * @param sizeKB size passed to the text generator, {@code 0} means no blob
     * @param onlyText when {@code true} the blob mime type is {@code text/plain}, else {@code text/partial}
     * @return this producer
     */
    public RandomDocumentMessageProducer withBlob(int sizeKB, boolean onlyText) {
        this.blobSizeKB = sizeKB;
        this.blobOnlyText = onlyText;
        return this;
    }

    /**
     * All the messages of a producer go to the same partition, derived from the producer id.
     */
    @Override
    public int getPartition(DocumentMessage message, int partitions) {
        return getProducerId() % partitions;
    }

    /**
     * Returns {@code true} while fewer than {@link #nbDocuments} messages have been produced, counting folders only
     * when {@link #countFolderAsDocument} is set.
     */
    @Override
    public boolean hasNext() {
        if (countFolderAsDocument) {
            return (documentCount + folderCount) < nbDocuments;
        }
        // strict comparison: "<=" would produce one document more than requested
        return documentCount < nbDocuments;
    }

    /**
     * Creates the next message and advances the tree generation state.
     */
    @Override
    public DocumentMessage next() {
        DocumentMessage ret;
        switch (currentType) {
        case Root:
            ret = createRoot();
            parents.add(ret.getId());
            currentType = DocType.Folder;
            foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
            break;
        case Folder:
            ret = createFolder(parents.get(parentIndex), children);
            folderishChildren.add(ret.getId());
            children.add(ret.getName());
            if (folderishChildren.size() >= foldersInCurrentFolderLimit) {
                // enough folders, now fill the current parent with documents
                currentType = DocType.Document;
                documentInCurrentFolderCount = 0;
                documentInCurrentFolderLimit = rand.nextInt(maxDocumentsPerFolder);
            }
            break;
        case Document:
        default:
            ret = createDocument(parents.get(parentIndex), children);
            children.add(ret.getName());
            documentInCurrentFolderCount += 1;
            if (documentInCurrentFolderCount > documentInCurrentFolderLimit) {
                // done with this parent, move to the next one
                parentIndex += 1;
                if (parentIndex >= parents.size()) {
                    // level exhausted: the folders just created become the parents of the next level
                    parents.clear();
                    parents = folderishChildren;
                    folderishChildren = new ArrayList<>();
                    children = new HashSet<>();
                    parentIndex = 0;
                }
                currentType = DocType.Folder;
                foldersInCurrentFolderLimit = rand.nextInt(maxFoldersPerFolder) + 1;
            }
            break;
        }
        return ret;
    }

    /**
     * Creates the root folder message, its name is prefixed by the zero padded producer id.
     */
    protected DocumentMessage createRoot() {
        folderCount++;
        return getRandomNodeWithPrefix(String.format("%02d-", getProducerId()), "Folder", "");
    }

    /**
     * Creates a "Folder" message without blob under the given parent.
     */
    protected DocumentMessage createFolder(String parentPath, Set<String> exclude) {
        DocumentMessage node = getRandomNodeWithExclusion("Folder", parentPath, false, exclude);
        folderCount++;
        return node;
    }

    /**
     * Creates a "File" message with a blob under the given parent.
     */
    protected DocumentMessage createDocument(String parentPath, Set<String> exclude) {
        DocumentMessage node = getRandomNodeWithExclusion("File", parentPath, true, exclude);
        documentCount++;
        return node;
    }

    /**
     * Creates a random node, renaming it once with a random numeric suffix when its name is part of
     * {@code exclude}. The renamed node is not checked again, so a collision is made unlikely but not impossible.
     */
    protected DocumentMessage getRandomNodeWithExclusion(String type, String parentPath, boolean withBlob,
            Set<String> exclude) {
        DocumentMessage node = getRandomNode(type, parentPath, withBlob);
        String name = node.getName();
        if (exclude.contains(name)) {
            // exclude contains at least the name here, so the bound is strictly positive
            String newName = name + "-" + rand.nextInt(exclude.size());
            node = DocumentMessage.copy(node, newName);
        }
        return node;
    }

    /**
     * Creates a message with a random title, a name derived from the title and random properties.
     *
     * @param withBlob when {@code true} a blob, or blob information if a fetcher is set, is attached
     */
    protected DocumentMessage getRandomNode(String type, String parentPath, boolean withBlob) {
        String title = getTitle();
        String name = getName(title);
        HashMap<String, Serializable> props = getRandomProperties(title);
        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
        if (withBlob) {
            attachBlob(builder);
        }
        return builder.build();
    }

    /**
     * Same as {@link #getRandomNode(String, String, boolean)} with a prefixed name; a blob is always attached.
     */
    protected DocumentMessage getRandomNodeWithPrefix(String prefix, String type, String parentPath) {
        String title = getTitle();
        String name = prefix + getName(title);
        HashMap<String, Serializable> props = getRandomProperties(title);
        DocumentMessage.Builder builder = DocumentMessage.builder(type, parentPath, name).setProperties(props);
        // NOTE(review): this also attaches a blob to the root "Folder" whereas createFolder does not, confirm this
        // is intended
        attachBlob(builder);
        return builder.build();
    }

    /**
     * Sets the blob information from the fetcher when there is one, else a random blob.
     */
    private void attachBlob(DocumentMessage.Builder builder) {
        if (blobInfoFetcher != null) {
            builder.setBlobInfo(blobInfoFetcher.get(builder));
        } else {
            builder.setBlob(getRandomBlob());
        }
    }

    /**
     * Returns a blob of random text, or {@code null} when {@link #blobSizeKB} is {@code 0}.
     */
    protected Blob getRandomBlob() {
        if (blobSizeKB == 0) {
            return null;
        }
        String content = gen.getRandomText(blobSizeKB);
        return Blobs.createBlob(content, getBlobMimeType(), null, getName(getTitle()) + ".txt");
    }

    protected String getBlobMimeType() {
        if (blobOnlyText) {
            return "text/plain";
        } else {
            return "text/partial";
        }
    }

    /**
     * Derives a node name from a title: every run of non word characters becomes a dash and the result is lower
     * cased.
     */
    protected String getName(String title) {
        // Locale.ROOT keeps the generated names independent from the default locale of the JVM
        return title.replaceAll("\\W+", "-").toLowerCase(Locale.ROOT);
    }

    /**
     * Returns a capitalized random title, generated with a random size argument between 1 and 3.
     */
    protected String getTitle() {
        return capitalize(gen.getRandomTitle(rand.nextInt(3) + 1).trim());
    }

    /**
     * Upper cases the first character of the line; a {@code null} or empty line is returned unchanged.
     */
    protected String capitalize(final String line) {
        if (line == null || line.isEmpty()) {
            // nothing to capitalize, charAt(0) would throw on an empty string
            return line;
        }
        return Character.toUpperCase(line.charAt(0)) + line.substring(1);
    }

    /**
     * Builds random dublincore properties for a node, a description is added in roughly one case out of ten.
     */
    protected HashMap<String, Serializable> getRandomProperties(String title) {
        HashMap<String, Serializable> ret = new HashMap<>();
        ret.put("dc:title", title);
        if (rand.nextInt(10) == 1) {
            String description = gen.getRandomTitle(rand.nextInt(5) + 1);
            ret.put("dc:description", capitalize(description));
        }
        ret.put("dc:nature", getGaussian(DC_NATURE));
        ret.put("dc:subjects", (Serializable) Collections.singletonList(getGaussian(DC_SUBJECTS)));
        ret.put("dc:rights", getGaussian(DC_RIGHTS));
        ret.put("dc:language", getGaussian(DC_LANGUAGE));
        ret.put("dc:coverage", getGaussian(DC_COVERAGE));
        ret.put("dc:source", getGaussian(DC_SOURCE));
        return ret;
    }

    /**
     * Picks a word using the absolute value of a gaussian sample, so that the first entries of the array are
     * returned more often than the last ones.
     */
    protected String getGaussian(String[] words) {
        double g = Math.abs(rand.nextGaussian() / 4);
        // clamp to 1 so that the index never exceeds the last entry
        g = Math.min(g, 1);
        int i = (int) Math.floor(g * (words.length - 1));
        return words[i];
    }

    /**
     * Closes the producer and the blob information fetcher if any.
     */
    @Override
    public void close() throws Exception {
        try {
            super.close();
        } finally {
            // release the fetcher even if the parent close fails
            if (blobInfoFetcher != null) {
                blobInfoFetcher.close();
            }
        }
    }

}