package squidpony;

import regexodus.Category;
import regexodus.Matcher;
import regexodus.Pattern;
import squidpony.squidmath.Arrangement;
import squidpony.squidmath.IntIntOrderedMap;
import squidpony.squidmath.IntVLA;

import java.io.Serializable;
import java.util.ArrayList;

/**
 * A simple character-level Markov chain text generator; call {@link #analyze(CharSequence)} once on a large sample
 * text, then you can call {@link #chain(long)} many times to get odd-sounding, word-like "remixes" of the sample text.
 * This is meant to allow easy serialization of the necessary data to call chain(); if you can store the
 * {@link #chars}, {@link #pairs}, and {@link #processed} fields in some serialized form, then you can reassign them to
 * the same fields to avoid calling analyze(). One way to do this conveniently is to use {@link #serializeToString()}
 * after calling analyze() once and to save the resulting String; then, rather than calling analyze() again on future
 * runs, you would call {@link #deserializeFromString(String)} to create the MarkovChar without needing any repeated
 * analysis.
 * <br>
 * Created by Tommy Ettinger on 1/30/2018.
 */
public class MarkovChar implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * All chars encountered during the latest call to {@link #analyze(CharSequence)}, case-sensitive; only Unicode
     * letters and the apostrophe are kept as-is, every other char is replaced by the separator ' ', and the internal
     * start marker '^' is also present. Will be null if {@link #analyze(CharSequence)} was never called.
     */
    public char[] chars;

    /**
     * Map of all pairs of chars encountered to the position in the order they were encountered. Pairs are stored using
     * their 16-bit {@link #chars} indices placed into the most-significant bits for the first char and the
     * least-significant bits for the second char. The size of this IntIntOrderedMap is likely to be larger than the
     * char array {@link #chars}, but should be equal to {@code processed.length}. Will be null if
     * {@link #analyze(CharSequence)} was never called.
     */
    public IntIntOrderedMap pairs;
    /**
     * Complicated data that mixes probabilities of chars using their indices in {@link #chars} and the indices of char
     * pairs in {@link #pairs}, generated during the latest call to {@link #analyze(CharSequence)}. This is a jagged 2D
     * array. Will be null if {@link #analyze(CharSequence)} was never called.
     */
    public int[][] processed;

    private static final Character INITIAL = '^', END = ' ';
    private static final Matcher matcher = Pattern.compile("[\\p{L}']").matcher();
    public MarkovChar()
    {
    }

    /**
     * This is the main necessary step before using a MarkovChar; you must call this method at some point before you
     * can call any other methods. You can serialize this MarkovChar after calling analyze() to avoid needing to call
     * it again on later runs, or even include serialized MarkovChar objects with a game to only need to call this
     * during pre-processing. This method analyzes the pairings of chars in a (typically large) corpus text, keeping
     * Unicode letters and apostrophes as-is and treating every other char as a separator between "words." It only
     * uses one preceding char to determine the subsequent char. When it finishes processing, it stores the results in
     * {@link #chars}, {@link #pairs}, and {@link #processed}, which allows other methods to be called (they will throw
     * a {@link NullPointerException} if analyze() hasn't been called).
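     * <br>
     * A minimal usage sketch, assuming some large {@code sampleText} CharSequence is available (the variable names
     * here are hypothetical):
     * <pre>{@code
     * MarkovChar markov = new MarkovChar();
     * markov.analyze(sampleText);              // one-time analysis of the corpus
     * String word = markov.chain(12345L, 12);  // cheap to call many times with different seeds
     * }</pre>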
     * @param corpus a typically-large sample text in the style that should be mimicked
     */
    public void analyze(CharSequence corpus)
    {
        final int clen = corpus.length();
        Arrangement<Character> body = new Arrangement<>((clen >> 4) + 5);
        pairs = new IntIntOrderedMap(clen / 5 + 5);
        ArrayList<IntVLA> working = new ArrayList<>(clen / 5 + 5);
        body.add(INITIAL);
        working.add(new IntVLA(128));
        pairs.put(0, 0);
        body.add(END);
//        working.add(new IntVLA(links));

        //matcher.setTarget(corpus);
        int current, pair = 0, pre = 0, post;
        for (int i = 0; i < clen; i++) {
            char c = corpus.charAt(i);
            if('\'' != c && !Category.L.contains(c))
                c = END;
            current = body.addOrIndex(c);
            pair = pair << 16 | (current & 0xFFFF);
            if(pair == 1)
                continue;
            post = pairs.putIfAbsent(pair, pairs.size());
            if(working.size() != pairs.size())
            {
                working.add(new IntVLA(16));
            }
            working.get(pre).add(current);
            if(current == 1)
            {
                working.get(post).add(0);
                pair = 0;
                pre = 0;
            }
            else
            {
                pre = post;
            }
        }
        IntVLA w = working.get(pre), v;
        if(w.size == 0) w.add(0);
        final int len = body.size(), pairLen = working.size();
        chars = new char[len];
        for (int i = 0; i < len; i++) {
            chars[i] = body.keyAt(i);
        }

        processed = new int[pairLen][];
        w = new IntVLA(128);
        IntVLA small = new IntVLA(128);
        IntVLA large = new IntVLA(128);
        IntVLA probabilities = new IntVLA(128);
        for(int iv = 0; iv < pairLen; iv++ )
        {
            v = working.get(iv);
            w.clear();
            probabilities.clear();
            if(v.size <= 0)
            {
                v.add(1);
            }
            int vv, sum = 0;
            final int vs = v.size;
            OUTER:
            for (int i = 0; i < vs; ++i) {
                vv = v.get(i);
                for (int j = 0; j < w.size; j++) {
                    if (w.get(j) == vv) {
                        probabilities.incr(j, 0x10000);
                        sum += 0x10000;
                        continue OUTER;
                    }
                }
                w.add(vv);
                probabilities.add(0x10000);
                sum += 0x10000;
            }
            int iAverage = (sum / w.size);

            small.clear();
            large.clear();
            /* Populate the stacks with the input probabilities. */
            for (int i = 0; i < probabilities.size; i++) {
                /* If the probability is below the average probability, then we add
                 * it to the small list; otherwise we add it to the large list.
                 */
                if (probabilities.get(i) >= iAverage)
                    large.add(i);
                else
                    small.add(i);
            }

            processed[iv] = new int[w.size * 3];

            while (!small.isEmpty() && !large.isEmpty()) {
                /* Get the index of the small and the large probabilities. */
                int less = small.pop(), less2 = less * 3;
                int more = large.pop();

                /* These probabilities have not yet been scaled up to be such that
                 * sum/n is given weight 1.0. We do this here instead.
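                 * Each group of three ints written into processed[iv] holds a probability
                 * threshold scaled to 16 bits, a char index from w, and an alternate ("alias")
                 * char index, in the alias-method layout; chain() later compares 16 random bits
                 * against the threshold to pick one of the two char indices.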
                 */
                processed[iv][less2] = (probabilities.size * probabilities.get(less)) / (sum >> 16);
                processed[iv][less2+1] = w.get(less);
                processed[iv][less2+2] = w.get(more);
                vv = probabilities.get(less) - iAverage;
                probabilities.incr(more, vv);
                if (probabilities.get(more) >= iAverage)
                    large.add(more);
                else
                    small.add(more);
            }
            int t;
            while (!small.isEmpty())
            {
                processed[iv][(t = small.pop()) * 3] = 0xFFFF;
                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
            }
            while (!large.isEmpty())
            {
                processed[iv][(t = large.pop()) * 3] = 0xFFFF;
                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
            }
        }
    }

    /**
     * Generate a word-like piece of text based on the previously analyzed corpus text (using
     * {@link #analyze(CharSequence)}); it terminates when the chain would produce a separator (any char that is not a
     * Unicode letter or apostrophe) or once the length would reach 200 characters.
     * @param seed the seed for the random decisions this makes, as a long; any long can be used
     * @return a String generated from the analyzed corpus text's char placement, usually a short word-like run
     */
    public String chain(long seed) {
        return chain(seed, 200);
    }

    /**
     * Generate a word-like piece of text based on the previously analyzed corpus text (using
     * {@link #analyze(CharSequence)}); it terminates when the chain would produce a separator (any char that is not a
     * Unicode letter or apostrophe) or once appending another char would reach maxLength.
     * @param seed the seed for the random decisions this makes, as a long; any long can be used
     * @param maxLength the maximum length for the generated String, in number of characters
     * @return a String generated from the analyzed corpus text's char placement, usually a short word-like run
     */
    public String chain(long seed, int maxLength) {
        int before, pair = 0;
        long state;
        StringBuilder sb = new StringBuilder(1000);
        int[] rf;
        while (sb.length() < maxLength) {
            rf = processed[pairs.get(pair)];
            // This is LightRNG's algorithm to generate a random long given sequential states
            state = ((state = ((state = ((seed += 0x9E3779B97F4A7C15L) ^ seed >>> 30) * 0xBF58476D1CE4E5B9L) ^ state >>> 27) * 0x94D049BB133111EBL) ^ state >>> 31);
            // get a random multiple of 3 that is less than rf.length (rf stores groups of 3 ints)
            int column = (int) ((rf.length * (state & 0xFFFFFFFFL)) / 0x300000000L) * 3; // rf.length is 3 * choices; divide by 3 * 2^32
            // use 16 of the remaining bits of state to choose either this column's char index or
            // its alias, based on the stored probability threshold
            //before = ((state >>> 33) > rf[column]) ? rf[column + 1] : rf[column + 2];
            if((state >>> 48) > rf[column])
                before = rf[column + 1];
            else
                before = rf[column + 2];
            if(before > 1)
            {
                if(sb.length() + 1 < maxLength)
                {
                    sb.append(chars[before]);
                    pair = pair << 16 | (before & 0xFFFF);
                }
                else
                {
                    break;
                }
            }
            else
            {
                break;
            }

        }
        return sb.toString();
    }

    /**
     * Returns a representation of this MarkovChar as a String; use {@link #deserializeFromString(String)} to get a
     * MarkovChar back from this String. The {@link #chars}, {@link #pairs}, and {@link #processed} fields must have
     * been given values by either direct assignment, calling {@link #analyze(CharSequence)}, or building this
     * MarkovChar with the aforementioned deserializeFromString method. Uses tab characters to separate the three
     * fields (the chars as one String, the comma-separated keys of pairs, and the stringified processed array).
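     * <br>
     * A round-trip sketch (storing and reloading the String is left to the caller; {@code markov} is assumed to have
     * already had analyze() called on it):
     * <pre>{@code
     * String stored = markov.serializeToString();
     * MarkovChar loaded = MarkovChar.deserializeFromString(stored);
     * }</pre>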
     * @return a String that can be used to store the analyzed char pairs and frequencies in this MarkovChar
     */
    public String serializeToString()
    {
        return String.valueOf(chars) + "\t" + StringKit.join(",", pairs.keysAsArray()) + "\t" + Converters.convertArrayInt2D.stringify(processed);
    }

    /**
     * Recreates an already-analyzed MarkovChar given a String produced by {@link #serializeToString()}.
     * @param data a String returned by {@link #serializeToString()}
     * @return a MarkovChar that is ready to generate text with {@link #chain(long)}
     */
    public static MarkovChar deserializeFromString(String data)
    {
        int split = data.indexOf('\t');
        MarkovChar markov = new MarkovChar();
        markov.chars = data.substring(0, split).toCharArray();
        int[] arr = Converters.convertArrayInt.restore(data.substring(split+1, split = data.indexOf('\t', split + 1)));
        markov.pairs = new IntIntOrderedMap(arr, ArrayTools.range(arr.length));
        markov.processed = Converters.convertArrayInt2D.restore(data.substring(split + 1));
        return markov;
    }

    /**
     * Copies the char array {@link #chars}, the {@link #pairs} map, and the 2D jagged int array {@link #processed}
     * into a new MarkovChar. None of the copied fields will be equivalent references, so changing one MarkovChar will
     * not affect the other.
     * @return a copy of this MarkovChar
     */
    public MarkovChar copy()
    {
        MarkovChar other = new MarkovChar();
        other.chars = new char[chars.length];
        System.arraycopy(chars, 0, other.chars, 0, chars.length);
        // pairs is also needed by chain() and serializeToString(); rebuild it from its keys, since its values are
        // simply the insertion order 0, 1, 2, ...
        other.pairs = new IntIntOrderedMap(pairs.keysAsArray(), ArrayTools.range(pairs.size()));
        other.processed = new int[processed.length][];
        int len;
        for (int i = 0; i < processed.length; i++) {
            other.processed[i] = new int[len = processed[i].length];
            System.arraycopy(processed[i], 0, other.processed[i], 0, len);
        }
        return other;
    }
}