001package squidpony; 002 003import squidpony.squidmath.Arrangement; 004import squidpony.squidmath.IntVLA; 005 006import java.io.Serializable; 007import java.util.ArrayList; 008import java.util.List; 009 010/** 011 * A simple Markov chain generator that works with Lists of some type instead of text like {@link MarkovTextLimited}. 012 * Call {@link #analyze(Iterable)} or {@link #analyze(Object[])} once on a large sample Iterable or array where 013 * sequences of items matter (this is called a corpus, and could be e.g. a List or an array), then you can call 014 * {@link #chain(long)} many times to get "remixes" of the sample Iterable/array as a List. This is meant to allow easy 015 * serialization of the necessary data to call chain(); if you can store the {@link #body} and {@link #processed} data 016 * structures in some serialized form, then you can reassign them to the same fields to avoid calling analyze(). This 017 * requires some way to serialize body, which is an {@link Arrangement} of T, and so T must be serializable in some way 018 * (not necessarily the {@link java.io.Serializable} interface, but possibly that). 019 * <br> 020 * Created by Tommy Ettinger on 2/26/2018. 021 */ 022public class MarkovObject<T> implements Serializable { 023 private static final long serialVersionUID = 0L; 024 025 /** 026 * All unique T items that this encountered during the latest call to {@link #analyze(Iterable)}. 027 * Will be null if analyze() was never called. 028 */ 029 public Arrangement<T> body; 030 /** 031 * Complicated data that mixes probabilities and the indices of items in {@link #body}, generated during the latest 032 * call to {@link #analyze(Iterable)}. Will be null if analyze() was never called. 033 */ 034 public ArrayList<IntVLA> processed; 035 public ArrayList<IntVLA> raw; 036 037 public MarkovObject() 038 { 039 } 040 041 /** 042 * This is the main necessary step before using a MarkovObject; you must call this method at some point before you 043 * can call any other methods. This method analyzes the pairings of items in a (typically large) corpus Iterable. 044 * It only uses one preceding item to determine the subsequent word. It does not store any items as special stop 045 * terms, but it does use {@code null} to represent the start of a section (effectively treating any corpus as 046 * starting with null prepended), and will not produce null as output from {@link #chain(long)}. If null is 047 * encountered as part of corpus, it will be interpreted as a point to stop on and potentially start a new section. 048 * Since the last item in the corpus could have no known items to produce after it, the end of the corpus is treated 049 * as having null appended as well. When it finishes processing, it stores the results in {@link #body} and 050 * {@link #processed}, which allows other methods to be called (they will throw a {@link NullPointerException} if 051 * analyze() hasn't been called). 052 * <br> 053 * Unlike in {@link MarkovTextLimited}, you can analyze multiple corpus Iterables by calling this method more than once. 054 * 055 * @param corpus a typically-large sample Iterable in the style that should be mimicked 056 */ 057 public void analyze(Iterable<T> corpus) 058 { 059 if(body == null || processed == null) { 060 body = new Arrangement<>(64); 061 body.add(null); 062 raw = new ArrayList<>(64); 063 raw.add(new IntVLA(128)); 064 processed = new ArrayList<>(64); 065 processed.add(new IntVLA(128)); 066 } 067 int previous = 0, current; 068 for (T item : corpus) 069 { 070 current = body.addOrIndex(item); 071 if(raw.size() != body.size()) 072 { 073 raw.add(new IntVLA(16)); 074 } 075 raw.get(previous).add(current); 076 previous = current; 077 } 078 raw.get(previous).add(0); 079 IntVLA w, v; 080 final int len = raw.size(); 081 processed.ensureCapacity(len); 082 //processed = new int[len][]; 083 w = new IntVLA(128); 084 IntVLA small = new IntVLA(128); 085 IntVLA large = new IntVLA(128); 086 IntVLA probabilities = new IntVLA(128); 087 for(int iv = 0; iv < len; iv++ ) 088 { 089 v = raw.get(iv); 090 w.clear(); 091 probabilities.clear(); 092 if(v.size <= 0) 093 { 094 v.add(1); 095 } 096 int vv; 097 final long vs = v.size; 098 OUTER: 099 for (int i = 0; i < v.size; ++i) { 100 vv = v.get(i); 101 for (int j = 0; j < w.size; j++) { 102 if (w.get(j) == vv) { 103 probabilities.incr(j, 1); 104 continue OUTER; 105 } 106 } 107 w.add(vv); 108 probabilities.add(1); 109 } 110 final int iAverage = (int)((0x7FFFFFFFL * w.size) / v.size); 111 112 small.clear(); 113 large.clear(); 114 /* Populate the stacks with the input probabilities. */ 115 for (int i = 0; i < probabilities.size; i++) { 116 /* If the probability is below the average probability, then we add 117 * it to the small list; otherwise we add it to the large list. 118 */ 119 if (probabilities.get(i) * 0x7FFFFFFFL >= iAverage * vs) 120 large.add(i); 121 else 122 small.add(i); 123 } 124 if(processed.size() <= iv) 125 processed.add(v = new IntVLA(w.size * 3)); 126 else 127 { 128 v = processed.get(iv); 129 v.clear(); 130 } 131 final int[] va = v.setSize(w.size * 3); 132 //processed[iv] = new int[w.size * 3]; 133 134 while (!small.isEmpty() && !large.isEmpty()) { 135 /* Get the index of the small and the large probabilities. */ 136 int less = small.pop(), less2 = less * 3; 137 int more = large.pop(); 138 139 /* These probabilities have not yet been scaled up to be such that 140 * sum/n is given weight 1.0. We do this here instead. 141 */ 142 va[less2] = iAverage * probabilities.get(less); 143 va[less2+1] = w.get(less); 144 va[less2+2] = w.get(more); 145 146 probabilities.incr(more, probabilities.get(less) - iAverage); 147 148 if (probabilities.get(more) >= iAverage) 149 large.add(more); 150 else 151 small.add(more); 152 } 153 int t; 154 while (!small.isEmpty()) 155 { 156 va[(t = small.pop()) * 3] = 0x7FFFFFFF; 157 va[t * 3 + 1] = va[t * 3 + 2] = w.get(t); 158 } 159 while (!large.isEmpty()) 160 { 161 va[(t = large.pop()) * 3] = 0x7FFFFFFF; 162 va[t * 3 + 1] = va[t * 3 + 2] = w.get(t); 163 } 164 } 165 } 166 167 /** 168 * This is the main necessary step before using a MarkovObject; you must call this method at some point before you 169 * can call any other methods. This method analyzes the pairings of items in a (typically large) corpus array of T. 170 * It only uses one preceding item to determine the subsequent word. It does not store any items as special stop 171 * terms, but it does use {@code null} to represent the start of a section (effectively treating any corpus as 172 * starting with null prepended), and will not produce null as output from {@link #chain(long)}. If null is 173 * encountered as part of corpus, it will be interpreted as a point to stop on and potentially start a new section. 174 * Since the last item in the corpus could have no known items to produce after it, the end of the corpus is treated 175 * as having null appended as well. When it finishes processing, it stores the results in {@link #body} and 176 * {@link #processed}, which allows other methods to be called (they will throw a {@link NullPointerException} if 177 * analyze() hasn't been called). 178 * <br> 179 * Unlike in {@link MarkovTextLimited}, you can analyze multiple corpus arrays by calling this method more than once. 180 * 181 * @param corpus a typically-large sample array of T in the style that should be mimicked 182 */ 183 public void analyze(T[] corpus) 184 { 185 if(body == null || processed == null) { 186 body = new Arrangement<>(corpus.length * 3 >> 2); 187 body.add(null); 188 raw = new ArrayList<>(corpus.length * 3 >> 2); 189 raw.add(new IntVLA(128)); 190 processed = new ArrayList<>(corpus.length * 3 >> 2); 191 processed.add(new IntVLA(128)); 192 } 193 int previous = 0, current; 194 for (T item : corpus) 195 { 196 current = body.addOrIndex(item); 197 if(raw.size() != body.size()) 198 { 199 raw.add(new IntVLA(16)); 200 } 201 raw.get(previous).add(current); 202 previous = current; 203 } 204 raw.get(previous).add(0); 205 IntVLA w, v; 206 final int len = raw.size(); 207 processed.ensureCapacity(len); 208 //processed = new int[len][]; 209 w = new IntVLA(128); 210 IntVLA small = new IntVLA(128); 211 IntVLA large = new IntVLA(128); 212 IntVLA probabilities = new IntVLA(128); 213 for(int iv = 0; iv < len; iv++ ) 214 { 215 v = raw.get(iv); 216 w.clear(); 217 probabilities.clear(); 218 if(v.size <= 0) 219 { 220 v.add(1); 221 } 222 int vv; 223 final long vs = v.size; 224 OUTER: 225 for (int i = 0; i < v.size; ++i) { 226 vv = v.get(i); 227 for (int j = 0; j < w.size; j++) { 228 if (w.get(j) == vv) { 229 probabilities.incr(j, 1); 230 continue OUTER; 231 } 232 } 233 w.add(vv); 234 probabilities.add(1); 235 } 236 final int iAverage = (int)((0x7FFFFFFFL * w.size) / v.size); 237 238 small.clear(); 239 large.clear(); 240 /* Populate the stacks with the input probabilities. */ 241 for (int i = 0; i < probabilities.size; i++) { 242 /* If the probability is below the average probability, then we add 243 * it to the small list; otherwise we add it to the large list. 244 */ 245 if (probabilities.get(i) * 0x7FFFFFFFL >= iAverage * vs) 246 large.add(i); 247 else 248 small.add(i); 249 } 250 if(processed.size() <= iv) 251 processed.add(v = new IntVLA(w.size * 3)); 252 else 253 { 254 v = processed.get(iv); 255 v.clear(); 256 } 257 final int[] va = v.setSize(w.size * 3); 258 //processed[iv] = new int[w.size * 3]; 259 260 while (!small.isEmpty() && !large.isEmpty()) { 261 /* Get the index of the small and the large probabilities. */ 262 int less = small.pop(), less2 = less * 3; 263 int more = large.pop(); 264 265 /* These probabilities have not yet been scaled up to be such that 266 * sum/n is given weight 1.0. We do this here instead. 267 */ 268 va[less2] = iAverage * probabilities.get(less); 269 va[less2+1] = w.get(less); 270 va[less2+2] = w.get(more); 271 272 probabilities.incr(more, probabilities.get(less) - iAverage); 273 274 if (probabilities.get(more) >= iAverage) 275 large.add(more); 276 else 277 small.add(more); 278 } 279 int t; 280 while (!small.isEmpty()) 281 { 282 va[(t = small.pop()) * 3] = 0x7FFFFFFF; 283 va[t * 3 + 1] = va[t * 3 + 2] = w.get(t); 284 } 285 while (!large.isEmpty()) 286 { 287 va[(t = large.pop()) * 3] = 0x7FFFFFFF; 288 va[t * 3 + 1] = va[t * 3 + 2] = w.get(t); 289 } 290 } 291 } 292 293 /** 294 * Generates a 32-element List of T based on the given seed and previously analyzed corpus data (using 295 * {@link #analyze(Iterable)}). This can't stop before generating a chain of 32 items unless analyze() hasn't been 296 * called or it was called on an empty or invalid Iterable/array (i.e. all null). 297 * @param seed the seed for the random decisions this makes, as a long; any long can be used 298 * @return a 32-element T List generated from the analyzed corpus Iterable/array's pairings of items 299 */ 300 public List<T> chain(long seed) { 301 return chain(seed, 32, false, new ArrayList<T>(32)); 302 } 303 304 /** 305 * Adds T items to buffer to fill it up to maxLength, based on the given seed and previously analyzed corpus 306 * data (using {@link #analyze(Iterable)}). If buffer is already at least as long as maxLength, if analyze() hasn't 307 * been called or if it was called on an empty or invalid Iterable/array (i.e. all null), then this won't change 308 * buffer and will return it as-is. If null was present in the analyzed corpus along with other items and 309 * canStopEarly is true, then if null would be generated this will instead stop adding items to buffer and return 310 * buffer as it is. If canStopEarly was false in the last case, the generated null would be discarded and a value 311 * from the start of the corpus or following a null in the corpus would be used instead. 312 * @param seed the seed for the random decisions this makes, as a long; any long can be used 313 * @param maxLength the maximum length for the generated List, in items 314 * @param canStopEarly if true, this may add less than maxLength elements if null was present in the corpus 315 * @param buffer a List of T that will have elements added until maxLength is reached; if it already is larger than 316 * maxLength this won't do anything 317 * @return buffer, after items were added to fill maxLength (or to fill less if this stopped early) 318 */ 319 public List<T> chain(long seed, int maxLength, boolean canStopEarly, List<T> buffer) { 320 if(body == null || body.size() <= 1) 321 return buffer; 322 int before = 0; 323 long state; 324 IntVLA rf; 325 while (buffer.size() < maxLength) { 326 rf = processed.get(before); 327 // This is LightRNG's algorithm to generate a random long given sequential states 328 state = ((state = ((state = ((seed += 0x9E3779B97F4A7C15L) ^ seed >>> 30) * 0xBF58476D1CE4E5B9L) ^ state >>> 27) * 0x94D049BB133111EBL) ^ state >>> 31); 329 330 // get a random int (using half the bits of our previously-calculated state) that is less than size 331 int column = (int) ((rf.size * (state & 0xFFFFFFFFL)) / 0x300000000L) * 3; // divide by 2^32, round down to multiple of 3 332 // use the other half of the bits of state to get a double, compare to probability and choose either the 333 // current column or the alias for that column based on that probability 334 before = ((state >>> 33) <= rf.get(column)) ? rf.get(column + 1) : rf.get(column + 2); 335 if(before != 0) 336 { 337 buffer.add(body.keyAt(before)); 338 } 339 else if(canStopEarly) 340 { 341 break; 342 } 343 } 344 return buffer; 345 } 346 347 /** 348 * Copies the T items in {@link #body} and the int-based data structure {@link #processed} into a new MarkovObject. 349 * None of the inner values, such as IntVLA values in processed, will be equivalent references, but the items in 350 * body will be the same objects in both MarkovObject instances. 351 * @return a copy of this MarkovObject 352 */ 353 public MarkovObject<T> copy() 354 { 355 MarkovObject<T> other = new MarkovObject<>(); 356 other.body = new Arrangement<>(body); 357 other.processed = new ArrayList<>(processed.size()); 358 for (int i = 0; i < processed.size(); i++) { 359 other.processed.add(new IntVLA(processed.get(i))); 360 } 361 return other; 362 } 363}