package squidpony;

import regexodus.Category;
import regexodus.Matcher;
import regexodus.Pattern;
import squidpony.squidmath.Arrangement;
import squidpony.squidmath.IntIntOrderedMap;
import squidpony.squidmath.IntVLA;

import java.io.Serializable;
import java.util.ArrayList;

/**
 * A simple Markov chain text generator; call {@link #analyze(CharSequence)} once on a large sample text, then you can
 * call {@link #chain(long)} many times to get odd-sounding "remixes" of the sample text. This is an order-2 Markov
 * chain, so it chooses the next word based on the previous two words; {@link MarkovTextLimited} is an order-1 Markov
 * chain, and is faster, but produces lousy output because it only uses one previous word. This is meant to allow easy
 * serialization of the necessary data to call chain(); if you can store the {@link #words} array, the {@link #pairs}
 * map, and the {@link #processed} array in some serialized form, then you can reassign them to the same fields to
 * avoid calling analyze(). One way to do this conveniently is to use {@link #serializeToString()} after calling
 * analyze() once and to save the resulting String; then, rather than calling analyze() again on future runs, you would
 * call {@link #deserializeFromString(String)} to create the MarkovText without needing any repeated analysis.
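 * <br>
 * A minimal usage sketch ({@code sampleText} is a hypothetical placeholder for any large CharSequence corpus):
 * <pre>{@code
 * MarkovText markov = new MarkovText();
 * markov.analyze(sampleText);   // relatively slow on big corpora; do this once, or load a serialized MarkovText
 * String s1 = markov.chain(1L); // deterministic for a given seed
 * String s2 = markov.chain(2L); // a different seed usually yields a different "remix"
 * }</pre>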
 * <br>
 * Created by Tommy Ettinger on 1/30/2018.
 */
public class MarkovText implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * All words (case-sensitive and counting some punctuation as part of words) that this MarkovText encountered
     * during the latest call to {@link #analyze(CharSequence)}. Will be null if {@link #analyze(CharSequence)} was
     * never called.
     */
    public String[] words;

    /**
     * Map of all pairs of words encountered to the position in the order they were encountered. Pairs are stored using
     * their 16-bit {@link #words} indices placed into the most-significant bits for the first word and the
     * least-significant bits for the second word. The size of this IntIntOrderedMap is likely to be larger than the
     * String array {@link #words}, but should be equal to {@code processed.length}. Will be null if
     * {@link #analyze(CharSequence)} was never called.
     */
    public IntIntOrderedMap pairs;
    /**
     * Complicated data that mixes probabilities of words using their indices in {@link #words} and the indices of word
     * pairs in {@link #pairs}, generated during the latest call to {@link #analyze(CharSequence)}. This is a jagged 2D
     * array. Will be null if {@link #analyze(CharSequence)} was never called.
     */
    public int[][] processed;

    private static final String INITIAL = "", FULL_STOP = ".", EXCLAMATION = "!", QUESTION = "?", ELLIPSIS = "...";
    private static final Matcher matcher = Pattern.compile("\\.\\.\\.|[\\.!\\?]|[^\\.!\\?\"\\(\\)\\[\\]\\{\\}\\s]+").matcher();
    public MarkovText()
    {
    }

    /**
     * This is the main necessary step before using a MarkovText; you must call this method at some point before you
     * can call any other methods. You can serialize this MarkovText after calling this method to avoid needing to call
     * it again on later runs, or even include serialized MarkovText objects with a game to only need to call this
     * during pre-processing. This method analyzes the pairings of words in a (typically large) corpus text, including
     * some punctuation as part of words and some kinds as their own "words." It uses the pair of the two preceding
     * words to determine the subsequent word. When it finishes processing, it stores the results in {@link #words} and
     * {@link #processed}, which allows other methods to be called (they will throw a {@link NullPointerException} if
     * analyze() hasn't been called).
     * @param corpus a typically-large sample text in the style that should be mimicked
     */
    public void analyze(CharSequence corpus)
    {
        Arrangement<String> body = new Arrangement<>((corpus.length() >> 4) + 5);
        pairs = new IntIntOrderedMap(corpus.length() / 5 + 5);
        ArrayList<IntVLA> working = new ArrayList<>(corpus.length() / 5 + 5);
        body.add(INITIAL);
        working.add(new IntVLA(128));
        pairs.put(0, 0);
        body.add(FULL_STOP);
        body.add(EXCLAMATION);
        body.add(QUESTION);
        body.add(ELLIPSIS);
//        working.add(new IntVLA(links));

        matcher.setTarget(corpus);
        int current, pair = 0, pre = 0, post;
        while (matcher.find())
        {
            current = body.addOrIndex(matcher.group());
            // slide the pair window: the previous word moves into the high 16 bits, the current word into the low 16
            pair = pair << 16 | (current & 0xFFFF);
            post = pairs.putIfAbsent(pair, pairs.size());
            if(working.size() != pairs.size())
            {
                working.add(new IntVLA(16));
            }
            working.get(pre).add(current);
            // indices 1 through 4 are the sentence-ending "words": ".", "!", "?", and "..."
            if(current > 0 && current < 5)
            {
                working.get(post).add(0);
                pair = 0;
                pre = 0;
            }
            else
            {
                pre = post;
            }
        }
        IntVLA w = working.get(pre), v;
        if(w.size == 0) w.add(0);
        final int len = body.size(), pairLen = working.size();
        words = new String[len];
        body.keySet().toArray(words);

        processed = new int[pairLen][];
        w = new IntVLA(128);
        IntVLA small = new IntVLA(128);
        IntVLA large = new IntVLA(128);
        IntVLA probabilities = new IntVLA(128);
        for(int iv = 0; iv < pairLen; iv++ )
        {
            v = working.get(iv);
            w.clear();
            probabilities.clear();
            if(v.size <= 0)
            {
                v.add(1);
            }
            int vv, sum = 0;
            final int vs = v.size;
            OUTER:
            for (int i = 0; i < vs; ++i) {
                vv = v.get(i);
                for (int j = 0; j < w.size; j++) {
                    if (w.get(j) == vv) {
                        probabilities.incr(j, 0x10000);
                        sum += 0x10000;
                        continue OUTER;
                    }
                }
                w.add(vv);
                probabilities.add(0x10000);
                sum += 0x10000;
            }
            int iAverage = (sum / w.size);

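            // Everything below builds an alias table for this pair, in the style of Vose's alias method.
            // processed[iv] stores three ints per candidate word: a probability threshold on a 16-bit scale,
            // a word index, and an "alias" word index. chain() later picks one of these 3-int columns at random
            // and compares a random 16-bit value against the threshold to decide which of the two stored word
            // indices to use, so each generated word is selected in constant time.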
            small.clear();
            large.clear();
            /* Populate the stacks with the input probabilities. */
            for (int i = 0; i < probabilities.size; i++) {
                /* If the probability is below the average probability, then we add
                 * it to the small list; otherwise we add it to the large list.
                 */
                if (probabilities.get(i) >= iAverage)
                    large.add(i);
                else
                    small.add(i);
            }

            processed[iv] = new int[w.size * 3];

            while (!small.isEmpty() && !large.isEmpty()) {
                /* Get the index of the small and the large probabilities. */
                int less = small.pop(), less2 = less * 3;
                int more = large.pop();

                /* These probabilities have not yet been scaled up to be such that
                 * sum/n is given weight 1.0. We do this here instead.
                 */
                processed[iv][less2] = (probabilities.size * probabilities.get(less)) / (sum >> 16);
                processed[iv][less2+1] = w.get(less);
                processed[iv][less2+2] = w.get(more);
                vv = probabilities.get(less) - iAverage;
                probabilities.incr(more, vv);
                if (probabilities.get(more) >= iAverage)
                    large.add(more);
                else
                    small.add(more);
            }
            int t;
            while (!small.isEmpty())
            {
                processed[iv][(t = small.pop()) * 3] = 0xFFFF;
                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
            }
            while (!large.isEmpty())
            {
                processed[iv][(t = large.pop()) * 3] = 0xFFFF;
                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
            }
        }
    }

    /**
     * After calling {@link #analyze(CharSequence)}, you can optionally call this to alter any words in this MarkovText
     * that were used as a proper noun (determined by whether they were capitalized in the middle of a sentence),
     * changing them to a ciphered version using the given {@link NaturalLanguageCipher}. Normally you would initialize
     * a NaturalLanguageCipher with a {@link FakeLanguageGen} that matches the style you want for all names in this
     * text, then pass that to this method during pre-processing (not necessarily at runtime, since this method isn't
     * especially fast if the corpus was large). This method modifies this MarkovText in-place.
     * @param translator a NaturalLanguageCipher that will be used to translate proper nouns in this MarkovText's word array
     */
    public void changeNames(NaturalLanguageCipher translator)
    {
        String name;
        PER_WORD:
        for (int i = 5; i < words.length; i++) {
            if(Category.Lu.contains((name = words[i]).charAt(0)))
            {
                for (int w = 5; w < words.length; w++) {
                    for (int p = 0; p < processed[w].length; p++) {
                        // steps through 3-int columns; checks the two word-index slots and skips each threshold
                        if (i == processed[w][++p] || i == processed[w][++p])
                        {
                            words[i] = translator.cipher(name);
                            continue PER_WORD;
                        }
                    }
                }
            }
        }
    }
    /**
     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "..."), or once
     * the length would be greater than 200 characters without encountering stop punctuation (it terminates such a
     * sentence with "." or "...").
     * @param seed the seed for the random decisions this makes, as a long; any long can be used
     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
     */
    public String chain(long seed) {
        return chain(seed, 200);
    }

    /**
     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "...") or once
     * the maxLength would be exceeded by any other words (it terminates such a sentence with "." or "...").
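     * <br>
     * For example, assuming a MarkovText named {@code markov} that has already had analyze() called on it (a
     * hypothetical setup, shown only as a sketch):
     * <pre>{@code
     * String a = markov.chain(42L, 100); // at most 100 chars; ends with stop punctuation or "." / "..."
     * String b = markov.chain(42L, 100); // same seed and maxLength, so identical to a
     * String c = markov.chain(43L, 100); // a different seed usually produces different text
     * }</pre>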
     * @param seed the seed for the random decisions this makes, as a long; any long can be used
     * @param maxLength the maximum length for the generated String, in number of characters
     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
     */
    public String chain(long seed, int maxLength) {
        int before, pair = 0;
        boolean later;
        long state;
        StringBuilder sb = new StringBuilder(1000);
        int[] rf;
        while (sb.length() < maxLength) {
            if(sb.length() >= maxLength - 3)
            {
                sb.append('.');
                break;
            }
            later = (pair != 0);
            rf = processed[pairs.get(pair)];
            // This is LightRNG's algorithm to generate a random long given sequential states
            state = ((state = ((state = ((seed += 0x9E3779B97F4A7C15L) ^ seed >>> 30) * 0xBF58476D1CE4E5B9L) ^ state >>> 27) * 0x94D049BB133111EBL) ^ state >>> 31);
            // get a random int (using half the bits of our previously-calculated state) that is less than size
            int column = (int) ((rf.length * (state & 0xFFFFFFFFL)) / 0x300000000L) * 3; // divide by 2^32, round down to multiple of 3
            // use the other half of the bits of state to get a double, compare to probability and choose either the
            // current column or the alias for that column based on that probability
            //before = ((state >>> 33) > rf[column]) ? rf[column + 1] : rf[column + 2];
            if((state >>> 48) > rf[column])
                before = rf[column + 1];
            else
                before = rf[column + 2];
            if(before >= 5)
            {
                if(sb.length() + words[before].length() + 1 < maxLength)
                {
                    if(later)
                        sb.append(' ');
                    sb.append(words[before]);
                    pair = pair << 16 | (before & 0xFFFF);
                }
                else
                {
                    if(sb.length() + 3 <= maxLength)
                        sb.append("...");
                    else
                        sb.append('.');
                    break;
                }
            }
            else if(before != 0)
            {
                sb.append(words[before]);
                break;
            }
        }
        return sb.toString();
    }

    /**
     * Returns a representation of this MarkovText as a String; use {@link #deserializeFromString(String)} to get a
     * MarkovText back from this String. The {@link #words}, {@link #pairs}, and {@link #processed} fields must have
     * been given values by either direct assignment, calling {@link #analyze(CharSequence)}, or building this
     * MarkovText with the aforementioned deserializeFromString method. Uses spaces to separate words, commas to
     * separate pair keys, and tabs to separate the three fields.
     * @return a String that can be used to store the analyzed words and frequencies in this MarkovText
     */
    public String serializeToString()
    {
        return StringKit.join(" ", words) + "\t" + StringKit.join(",", pairs.keysAsArray()) + "\t" + Converters.convertArrayInt2D.stringify(processed);
    }

    /**
     * Recreates an already-analyzed MarkovText given a String produced by {@link #serializeToString()}.
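     * <br>
     * A sketch of the intended save/load pattern ({@code bigCorpus} and the storage of {@code stored} are
     * hypothetical; any mechanism works as long as the String round-trips unchanged):
     * <pre>{@code
     * // during pre-processing, once:
     * MarkovText markov = new MarkovText();
     * markov.analyze(bigCorpus);
     * String stored = markov.serializeToString(); // write this String to a file or resource
     * // on later runs, instead of calling analyze() again:
     * MarkovText loaded = MarkovText.deserializeFromString(stored);
     * String sentence = loaded.chain(12345L);
     * }</pre>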
     * @param data a String returned by {@link #serializeToString()}
     * @return a MarkovText that is ready to generate text with {@link #chain(long)}
     */
    public static MarkovText deserializeFromString(String data)
    {
        int split = data.indexOf('\t');
        MarkovText markov = new MarkovText();
        markov.words = StringKit.split(data.substring(0, split), " ");
        int[] arr = Converters.convertArrayInt.restore(data.substring(split+1, split = data.indexOf('\t', split + 1)));
        markov.pairs = new IntIntOrderedMap(arr, ArrayTools.range(arr.length));
        markov.processed = Converters.convertArrayInt2D.restore(data.substring(split + 1));
        return markov;
    }

    /**
     * Copies the String array {@link #words}, the pair map {@link #pairs}, and the 2D jagged int array
     * {@link #processed} into a new MarkovText. None of the arrays will be equivalent references, but the Strings
     * (being immutable) will be the same objects in both MarkovText instances. This is primarily useful with
     * {@link #changeNames(NaturalLanguageCipher)}, which can produce several variants on names given several initial
     * copies produced with this method.
     * @return a copy of this MarkovText
     */
    public MarkovText copy()
    {
        MarkovText other = new MarkovText();
        other.words = new String[words.length];
        System.arraycopy(words, 0, other.words, 0, words.length);
        other.processed = new int[processed.length][];
        int len;
        for (int i = 0; i < processed.length; i++) {
            other.processed[i] = new int[len = processed[i].length];
            System.arraycopy(processed[i], 0, other.processed[i], 0, len);
        }
        // also rebuild pairs, since chain() needs it; its values are simply the insertion-order indices
        other.pairs = new IntIntOrderedMap(pairs.keysAsArray(), ArrayTools.range(pairs.size()));
        return other;
    }
}