package squidpony;

import regexodus.Category;
import regexodus.Matcher;
import regexodus.Pattern;
import squidpony.squidmath.Arrangement;
import squidpony.squidmath.IntVLA;

import java.io.Serializable;
import java.util.ArrayList;

/**
 * A simple Markov chain text generator; it is called "Limited" because it can only be used as an order-1 Markov chain,
 * meaning only one prior word is looked at. To use, call {@link #analyze(CharSequence)} once on a large sample text,
 * then you can call {@link #chain(long)} many times to get odd-sounding "remixes" of the sample text. For more natural
 * output, you can use {@link MarkovText}, which is an order-2 Markov chain and so looks at two previous words. This is
 * meant to allow easy serialization of the necessary data to call chain(); if you can store the {@link #words} and
 * {@link #processed} arrays in some serialized form, then you can reassign them to the same fields to avoid calling
 * analyze(). One way to do this conveniently is to use {@link #serializeToString()} after calling analyze() once and to
 * save the resulting String; then, rather than calling analyze() again on future runs, you would call
 * {@link #deserializeFromString(String)} to create the MarkovTextLimited without needing any repeated analysis.
 * <br>
 * Created by Tommy Ettinger on 1/30/2018.
 */
public class MarkovTextLimited implements Serializable {
    private static final long serialVersionUID = 0L;

    /**
     * All words (case-sensitive and counting some punctuation as part of words) that this MarkovTextLimited
     * encountered during the latest call to {@link #analyze(CharSequence)}. Will be null if analyze() was never called.
     */
    public String[] words;
    /**
     * Complicated data that mixes probabilities and the indices of words in {@link #words}, generated during the latest
     * call to {@link #analyze(CharSequence)}. This is a jagged 2D array. Will be null if analyze() was never called.
     */
    public int[][] processed;

    private static final String INITIAL = "", FULL_STOP = ".", EXCLAMATION = "!", QUESTION = "?", ELLIPSIS = "...";
    private static final Matcher matcher = Pattern.compile("\\.\\.\\.|[\\.!\\?]|[^\\.!\\?\"\\(\\)\\[\\]\\{\\}\\s]+").matcher();

    public MarkovTextLimited()
    {
    }

    /**
     * This is the main necessary step before using a MarkovTextLimited; you must call this method at some point before
     * you can call any other methods. You can serialize this MarkovTextLimited after calling analyze() to avoid
     * needing to call it again on later runs, or even include serialized MarkovTextLimited objects with a game so this
     * step only needs to happen during pre-processing. This method analyzes the pairings of words in a (typically
     * large) corpus text, including some punctuation as part of words and treating some kinds as their own "words."
     * It only uses one preceding word to determine the subsequent word. When it finishes processing, it stores the
     * results in {@link #words} and {@link #processed}, which allows other methods to be called (they will throw a
     * {@link NullPointerException} if analyze() hasn't been called).
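     * <br>
     * A minimal usage sketch; {@code sample} here stands in for whatever corpus text you actually load:
     * <pre>{@code
     * MarkovTextLimited markov = new MarkovTextLimited();
     * markov.analyze(sample);                     // one-time, possibly slow, analysis step
     * String sentence = markov.chain(12345L);     // cheap to call many times with different seeds
     * String stored = markov.serializeToString(); // save this String to skip analyze() on later runs
     * MarkovTextLimited loaded = MarkovTextLimited.deserializeFromString(stored);
     * }</pre>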
     * @param corpus a typically-large sample text in the style that should be mimicked
     */
    public void analyze(CharSequence corpus)
    {
        Arrangement<String> body = new Arrangement<>(corpus.length() / 5 + 5);
        body.add(INITIAL);
        body.add(FULL_STOP);
        body.add(EXCLAMATION);
        body.add(QUESTION);
        body.add(ELLIPSIS);
        ArrayList<IntVLA> working = new ArrayList<>(corpus.length() / 5 + 5);
        working.add(new IntVLA(128));
        int[] links = {0};
        working.add(new IntVLA(links));
        working.add(new IntVLA(links));
        working.add(new IntVLA(links));
        working.add(new IntVLA(links));

        matcher.setTarget(corpus);
        int previous = 0, current;
        while (matcher.find())
        {
            current = body.addOrIndex(matcher.group());
            if(working.size() != body.size())
            {
                working.add(new IntVLA(16));
            }
            working.get(previous).add(current);
            if(current > 0 && current < 5)
            {
                working.get(current).add(0);
                previous = 0;
            }
            else
                previous = current;
        }
        IntVLA w = working.get(previous), v;
        if(w.size == 0) w.add(0);
        final int len = working.size();
        words = new String[len];
        body.keySet().toArray(words);
        processed = new int[len][];
        w = new IntVLA(128);
        IntVLA small = new IntVLA(128);
        IntVLA large = new IntVLA(128);
        IntVLA probabilities = new IntVLA(128);
        for(int iv = 0; iv < len; iv++)
        {
            v = working.get(iv);
            w.clear();
            probabilities.clear();
            if(v.size <= 0)
            {
                v.add(1);
            }
            int vv, sum = 0;
            final int vs = v.size;
            OUTER:
            for (int i = 0; i < vs; ++i) {
                vv = v.get(i);
                for (int j = 0; j < w.size; j++) {
                    if (w.get(j) == vv) {
                        probabilities.incr(j, 0x10000);
                        sum += 0x10000;
                        continue OUTER;
                    }
                }
                w.add(vv);
                probabilities.add(0x10000);
                sum += 0x10000;
            }
            int iAverage = (sum / w.size);

            small.clear();
            large.clear();
            /* Populate the stacks with the input probabilities. */
            for (int i = 0; i < probabilities.size; i++) {
                /* If the probability is below the average probability, then we add
                 * it to the small list; otherwise we add it to the large list.
                 */
                if (probabilities.get(i) >= iAverage)
                    large.add(i);
                else
                    small.add(i);
            }

            processed[iv] = new int[w.size * 3];

            while (!small.isEmpty() && !large.isEmpty()) {
                /* Get the index of the small and the large probabilities. */
                int less = small.pop(), less2 = less * 3;
                int more = large.pop();

                /* These probabilities have not yet been scaled up to be such that
                 * sum/n is given weight 1.0. We do this here instead.
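                 * (This is the table-construction step of the alias method for sampling a discrete
                 * distribution: each triple in processed[iv] ends up holding a threshold, compared
                 * against 16 random bits in chain(), plus two word indices, so a lookup later needs
                 * only one random column and one comparison.)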
                 */
                processed[iv][less2] = (probabilities.size * probabilities.get(less)) / (sum >> 16);
                processed[iv][less2+1] = w.get(less);
                processed[iv][less2+2] = w.get(more);
                vv = probabilities.get(less) - iAverage;
                probabilities.incr(more, vv);
                if (probabilities.get(more) >= iAverage)
                    large.add(more);
                else
                    small.add(more);
            }
            int t;
            while (!small.isEmpty())
            {
                processed[iv][(t = small.pop()) * 3] = 0xFFFF;
                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
            }
            while (!large.isEmpty())
            {
                processed[iv][(t = large.pop()) * 3] = 0xFFFF;
                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
            }
        }
    }

    /**
     * After calling {@link #analyze(CharSequence)}, you can optionally call this to alter any words in this
     * MarkovTextLimited that were used as a proper noun (determined by whether they were capitalized in the middle of
     * a sentence), changing them to a ciphered version using the given {@link NaturalLanguageCipher}. Normally you
     * would initialize a NaturalLanguageCipher with a {@link FakeLanguageGen} that matches the style you want for all
     * names in this text, then pass that to this method during pre-processing (not necessarily at runtime, since this
     * method isn't especially fast if the corpus was large). This method modifies this MarkovTextLimited in-place.
     * @param translator a NaturalLanguageCipher that will be used to translate proper nouns in this MarkovTextLimited's word array
     */
    public void changeNames(NaturalLanguageCipher translator)
    {
        String name;
        PER_WORD:
        for (int i = 5; i < words.length; i++) {
            if(Category.Lu.contains((name = words[i]).charAt(0)))
            {
                for (int w = 5; w < words.length; w++) {
                    for (int p = 0; p < processed[w].length; p++) {
                        // processed[w] stores triples of {probability, word index, alias word index};
                        // this skips the probability slot and checks only the two word-index slots
                        if (i == processed[w][++p] || i == processed[w][++p])
                        {
                            words[i] = translator.cipher(name);
                            continue PER_WORD;
                        }
                    }
                }
            }
        }
    }
    /**
     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "..."), or once
     * the length would be greater than 200 characters without encountering stop punctuation (it terminates such a
     * sentence with "." or "...").
     * @param seed the seed for the random decisions this makes, as a long; any long can be used
     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
     */
    public String chain(long seed) {
        return chain(seed, 200);
    }

    /**
     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "...") or once
     * the maxLength would be exceeded by any other words (it terminates such a sentence with "." or "...").
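     * <br>
     * A rough example, assuming {@code markov} is a MarkovTextLimited that has already had analyze() called on it:
     * <pre>{@code
     * // different seeds give different (but deterministic) sentences; the cap here is 120 characters
     * String a = markov.chain(1L, 120);
     * String b = markov.chain(2L, 120);
     * }</pre>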
     * @param seed the seed for the random decisions this makes, as a long; any long can be used
     * @param maxLength the maximum length for the generated String, in number of characters
     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
     */
    public String chain(long seed, int maxLength) {
        int before = 0;
        boolean later;
        long state;
        StringBuilder sb = new StringBuilder(1000);
        int[] rf;
        while (sb.length() < maxLength) {
            if(sb.length() >= maxLength - 3)
            {
                sb.append('.');
                break;
            }
            later = (before != 0);
            rf = processed[before];
            // This is LightRNG's algorithm to generate a random long given sequential states
            state = ((state = ((state = ((seed += 0x9E3779B97F4A7C15L) ^ seed >>> 30) * 0xBF58476D1CE4E5B9L) ^ state >>> 27) * 0x94D049BB133111EBL) ^ state >>> 31);
            // get a random int (using half the bits of our previously-calculated state) that is less than size
            int column = (int) ((rf.length * (state & 0xFFFFFFFFL)) / 0x300000000L) * 3; // divide by 3 * 2^32 to get an entry index, then multiply by 3 to get that entry's column start
            // use the other half of the bits of state to get a double, compare to probability and choose either the
            // current column or the alias for that column based on that probability
            //before = ((state >>> 33) > rf[column]) ? rf[column + 1] : rf[column + 2];
            if((state >>> 48) <= rf[column])
                before = rf[column + 1];
            else
                before = rf[column + 2];
            if(before >= 5)
            {
                if(sb.length() + words[before].length() + 1 < maxLength)
                {
                    if(later)
                        sb.append(' ');
                    sb.append(words[before]);
                }
                else
                {
                    if(sb.length() + 3 <= maxLength)
                        sb.append("...");
                    else
                        sb.append('.');
                    break;
                }
            }
            else if(before != 0)
            {
                sb.append(words[before]);
                break;
            }
        }
        return sb.toString();
    }

    /**
     * Returns a representation of this MarkovTextLimited as a String; use {@link #deserializeFromString(String)} to
     * get a MarkovTextLimited back from this String. The {@link #words} and {@link #processed} fields must have been
     * given values by either direct assignment, calling {@link #analyze(CharSequence)}, or building this
     * MarkovTextLimited with the aforementioned deserializeFromString() method. Uses spaces to separate words and a
     * tab to separate the two fields.
     * @return a String that can be used to store the analyzed words and frequencies in this MarkovTextLimited
     */
    public String serializeToString()
    {
        return StringKit.join(" ", words) + "\t" + Converters.convertArrayInt2D.stringify(processed);
    }

    /**
     * Recreates an already-analyzed MarkovTextLimited given a String produced by {@link #serializeToString()}.
     * @param data a String returned by {@link #serializeToString()}
     * @return a MarkovTextLimited that is ready to generate text with {@link #chain(long)}
     */
    public static MarkovTextLimited deserializeFromString(String data)
    {
        int split = data.indexOf('\t');
        MarkovTextLimited markov = new MarkovTextLimited();
        markov.words = StringKit.split(data.substring(0, split), " ");
        markov.processed = Converters.convertArrayInt2D.restore(data.substring(split + 1));
        return markov;
    }

    /**
     * Copies the String array {@link #words} and the 2D jagged int array {@link #processed} into a new MarkovTextLimited.
     * None of the arrays will be equivalent references, but the Strings (being immutable) will be the same objects in
     * both MarkovTextLimited instances. This is primarily useful with {@link #changeNames(NaturalLanguageCipher)},
     * which can produce several variants on names given several initial copies produced with this method.
     * @return a copy of this MarkovTextLimited
     */
    public MarkovTextLimited copy()
    {
        MarkovTextLimited other = new MarkovTextLimited();
        other.words = new String[words.length];
        System.arraycopy(words, 0, other.words, 0, words.length);
        other.processed = new int[processed.length][];
        int len;
        for (int i = 0; i < processed.length; i++) {
            other.processed[i] = new int[len = processed[i].length];
            System.arraycopy(processed[i], 0, other.processed[i], 0, len);
        }
        return other;
    }
}