package squidpony;

import regexodus.Category;
import regexodus.Matcher;
import regexodus.Pattern;
import squidpony.squidmath.Arrangement;
import squidpony.squidmath.IntIntOrderedMap;
import squidpony.squidmath.IntVLA;

import java.io.Serializable;
import java.util.ArrayList;

/**
 * A simple Markov chain text generator; call {@link #analyze(CharSequence)} once on a large sample text, then you can
 * call {@link #chain(long)} many times to get odd-sounding "remixes" of the sample text. This is an order-2 Markov
 * chain, so it chooses the next word based on the previous two words; {@link MarkovTextLimited} is an order-1 Markov
 * chain, and is faster, but produces lousy output because it only uses one previous word. This is meant to allow easy
 * serialization of the necessary data to call chain(); if you can store the {@link #words} array, the {@link #pairs}
 * map, and the {@link #processed} array in some serialized form, then you can reassign them to the same fields to
 * avoid calling analyze(). One way to do this conveniently is to use {@link #serializeToString()} after calling
 * analyze() once and to save the resulting String; then, rather than calling analyze() again on future runs, you would
 * call {@link #deserializeFromString(String)} to create the MarkovText without needing any repeated analysis.
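 * <br>
 * A minimal usage sketch ({@code sampleText} is a hypothetical placeholder for any large CharSequence corpus):
 * <pre>{@code
 * MarkovText markov = new MarkovText();
 * markov.analyze(sampleText);   // relatively slow on big corpora; do this once, or load a serialized MarkovText
 * String s1 = markov.chain(1L); // deterministic for a given seed
 * String s2 = markov.chain(2L); // a different seed usually yields a different "remix"
 * }</pre>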
 * <br>
 * Created by Tommy Ettinger on 1/30/2018.
 */
public class MarkovText implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * All words (case-sensitive and counting some punctuation as part of words) that this MarkovText encountered
     * during the latest call to {@link #analyze(CharSequence)}. Will be null if {@link #analyze(CharSequence)} was
     * never called.
     */
    public String[] words;

    /**
     * Map of all pairs of words encountered to the position in the order they were encountered. Pairs are stored using
     * their 16-bit {@link #words} indices placed into the most-significant bits for the first word and the
     * least-significant bits for the second word. The size of this IntIntOrderedMap is likely to be larger than the
     * String array {@link #words}, but should be equal to {@code processed.length}. Will be null if
     * {@link #analyze(CharSequence)} was never called.
     */
    public IntIntOrderedMap pairs;
    /**
     * Complicated data that mixes probabilities of words using their indices in {@link #words} and the indices of word
     * pairs in {@link #pairs}, generated during the latest call to {@link #analyze(CharSequence)}. This is a jagged 2D
     * array. Will be null if {@link #analyze(CharSequence)} was never called.
     */
    public int[][] processed;

    private static final String INITIAL = "", FULL_STOP = ".", EXCLAMATION = "!", QUESTION = "?", ELLIPSIS = "...";
    private static final Matcher matcher = Pattern.compile("\\.\\.\\.|[\\.!\\?]|[^\\.!\\?\"\\(\\)\\[\\]\\{\\}\\s]+").matcher();
    public MarkovText()
    {
    }

    /**
     * This is the main necessary step before using a MarkovText; you must call this method at some point before you
     * can call any other methods. You can serialize this MarkovText after calling this method to avoid needing to call
     * it again on later runs, or even include serialized MarkovText objects with a game to only need to call this
     * during pre-processing. This method analyzes the pairings of words in a (typically large) corpus text, including
     * some punctuation as part of words and some kinds as their own "words." It uses the pair of the two preceding
     * words to determine the subsequent word. When it finishes processing, it stores the results in {@link #words} and
     * {@link #processed}, which allows other methods to be called (they will throw a {@link NullPointerException} if
     * analyze() hasn't been called).
     * @param corpus a typically-large sample text in the style that should be mimicked
     */
    public void analyze(CharSequence corpus)
    {
        Arrangement<String> body = new Arrangement<>((corpus.length() >> 4) + 5);
        pairs = new IntIntOrderedMap(corpus.length() / 5 + 5);
        ArrayList<IntVLA> working = new ArrayList<>(corpus.length() / 5 + 5);
        body.add(INITIAL);
        working.add(new IntVLA(128));
        pairs.put(0, 0);
        body.add(FULL_STOP);
        body.add(EXCLAMATION);
        body.add(QUESTION);
        body.add(ELLIPSIS);
//        working.add(new IntVLA(links));

        matcher.setTarget(corpus);
        int current, pair = 0, pre = 0, post;
        while (matcher.find())
        {
            current = body.addOrIndex(matcher.group());
            // slide the pair window: the previous word moves into the high 16 bits, the current word into the low 16
            pair = pair << 16 | (current & 0xFFFF);
            post = pairs.putIfAbsent(pair, pairs.size());
            if(working.size() != pairs.size())
            {
                working.add(new IntVLA(16));
            }
            working.get(pre).add(current);
            // indices 1 through 4 are the sentence-ending "words": ".", "!", "?", and "..."
            if(current > 0 && current < 5)
            {
                working.get(post).add(0);
                pair = 0;
                pre = 0;
            }
            else
            {
                pre = post;
            }
        }
        IntVLA w = working.get(pre), v;
        if(w.size == 0) w.add(0);
        final int len = body.size(), pairLen = working.size();
        words = new String[len];
        body.keySet().toArray(words);

        processed = new int[pairLen][];
        w = new IntVLA(128);
        IntVLA small = new IntVLA(128);
        IntVLA large = new IntVLA(128);
        IntVLA probabilities = new IntVLA(128);
        for(int iv = 0; iv < pairLen; iv++ )
        {
            v = working.get(iv);
            w.clear();
            probabilities.clear();
            if(v.size <= 0)
            {
                v.add(1);
            }
            int vv, sum = 0;
            final int vs = v.size;
            OUTER:
            for (int i = 0; i < vs; ++i) {
                vv = v.get(i);
                for (int j = 0; j < w.size; j++) {
                    if (w.get(j) == vv) {
                        probabilities.incr(j, 0x10000);
                        sum += 0x10000;
                        continue OUTER;
                    }
                }
                w.add(vv);
                probabilities.add(0x10000);
                sum += 0x10000;
            }
            int iAverage = (sum / w.size);

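            // Everything below builds an alias table for this pair, in the style of Vose's alias method.
            // processed[iv] stores three ints per candidate word: a probability threshold on a 16-bit scale,
            // a word index, and an "alias" word index. chain() later picks one of these 3-int columns at random
            // and compares a random 16-bit value against the threshold to decide which of the two stored word
            // indices to use, so each generated word is selected in constant time.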
            small.clear();
            large.clear();
            /* Populate the stacks with the input probabilities. */
            for (int i = 0; i < probabilities.size; i++) {
                /* If the probability is below the average probability, then we add
                 * it to the small list; otherwise we add it to the large list.
                 */
                if (probabilities.get(i) >= iAverage)
                    large.add(i);
                else
                    small.add(i);
            }

            processed[iv] = new int[w.size * 3];

            while (!small.isEmpty() && !large.isEmpty()) {
                /* Get the index of the small and the large probabilities. */
                int less = small.pop(), less2 = less * 3;
                int more = large.pop();

                /* These probabilities have not yet been scaled up to be such that
                 * sum/n is given weight 1.0. We do this here instead.
                 */
                processed[iv][less2] = (probabilities.size * probabilities.get(less)) / (sum >> 16);
                processed[iv][less2+1] = w.get(less);
                processed[iv][less2+2] = w.get(more);
                vv = probabilities.get(less) - iAverage;
                probabilities.incr(more, vv);
                if (probabilities.get(more) >= iAverage)
                    large.add(more);
                else
                    small.add(more);
            }
            int t;
            while (!small.isEmpty())
            {
                processed[iv][(t = small.pop()) * 3] = 0xFFFF;
                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
            }
            while (!large.isEmpty())
            {
                processed[iv][(t = large.pop()) * 3] = 0xFFFF;
                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
            }
        }
    }

    /**
     * After calling {@link #analyze(CharSequence)}, you can optionally call this to alter any words in this MarkovText
     * that were used as a proper noun (determined by whether they were capitalized in the middle of a sentence),
     * changing them to a ciphered version using the given {@link NaturalLanguageCipher}. Normally you would initialize
     * a NaturalLanguageCipher with a {@link FakeLanguageGen} that matches the style you want for all names in this
     * text, then pass that to this method during pre-processing (not necessarily at runtime, since this method isn't
     * especially fast if the corpus was large). This method modifies this MarkovText in-place.
     * @param translator a NaturalLanguageCipher that will be used to translate proper nouns in this MarkovText's word array
     */
    public void changeNames(NaturalLanguageCipher translator)
    {
        String name;
        PER_WORD:
        for (int i = 5; i < words.length; i++) {
            if(Category.Lu.contains((name = words[i]).charAt(0)))
            {
                for (int w = 5; w < words.length; w++) {
                    for (int p = 0; p < processed[w].length; p++) {
                        // steps through 3-int columns; checks the two word-index slots and skips each threshold
                        if (i == processed[w][++p] || i == processed[w][++p])
                        {
                            words[i] = translator.cipher(name);
                            continue PER_WORD;
                        }
                    }
                }
            }
        }
    }
    /**
     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "..."), or once
     * the length would be greater than 200 characters without encountering stop punctuation (it terminates such a
     * sentence with "." or "...").
     * @param seed the seed for the random decisions this makes, as a long; any long can be used
     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
     */
    public String chain(long seed) {
        return chain(seed, 200);
    }

    /**
     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "...") or once
     * the maxLength would be exceeded by any other words (it terminates such a sentence with "." or "...").
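     * <br>
     * For example, assuming a MarkovText named {@code markov} that has already had analyze() called on it (a
     * hypothetical setup, shown only as a sketch):
     * <pre>{@code
     * String a = markov.chain(42L, 100); // at most 100 chars; ends with stop punctuation or "." / "..."
     * String b = markov.chain(42L, 100); // same seed and maxLength, so identical to a
     * String c = markov.chain(43L, 100); // a different seed usually produces different text
     * }</pre>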
     * @param seed the seed for the random decisions this makes, as a long; any long can be used
     * @param maxLength the maximum length for the generated String, in number of characters
     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
     */
    public String chain(long seed, int maxLength) {
        int before, pair = 0;
        boolean later;
        long state;
        StringBuilder sb = new StringBuilder(1000);
        int[] rf;
        while (sb.length() < maxLength) {
            if(sb.length() >= maxLength - 3)
            {
                sb.append('.');
                break;
            }
            later = (pair != 0);
            rf = processed[pairs.get(pair)];
            // This is LightRNG's algorithm to generate a random long given sequential states
            state = ((state = ((state = ((seed += 0x9E3779B97F4A7C15L) ^ seed >>> 30) * 0xBF58476D1CE4E5B9L) ^ state >>> 27) * 0x94D049BB133111EBL) ^ state >>> 31);
            // get a random int (using half the bits of our previously-calculated state) that is less than size
            int column = (int) ((rf.length * (state & 0xFFFFFFFFL)) / 0x300000000L) * 3; // divide by 2^32, round down to multiple of 3
            // use the other half of the bits of state to get a double, compare to probability and choose either the
            // current column or the alias for that column based on that probability
            //before = ((state >>> 33) > rf[column]) ? rf[column + 1] : rf[column + 2];
            if((state >>> 48) > rf[column])
                before = rf[column + 1];
            else
                before = rf[column + 2];
            if(before >= 5)
            {
                if(sb.length() + words[before].length() + 1 < maxLength)
                {
                    if(later)
                        sb.append(' ');
                    sb.append(words[before]);
                    pair = pair << 16 | (before & 0xFFFF);
                }
                else
                {
                    if(sb.length() + 3 <= maxLength)
                        sb.append("...");
                    else
                        sb.append('.');
                    break;
                }
            }
            else if(before != 0)
            {
                sb.append(words[before]);
                break;
            }
        }
        return sb.toString();
    }

    /**
     * Returns a representation of this MarkovText as a String; use {@link #deserializeFromString(String)} to get a
     * MarkovText back from this String. The {@link #words}, {@link #pairs}, and {@link #processed} fields must have
     * been given values by either direct assignment, calling {@link #analyze(CharSequence)}, or building this
     * MarkovText with the aforementioned deserializeFromString method. Uses spaces to separate words, commas to
     * separate pair keys, and tabs to separate the three fields.
     * @return a String that can be used to store the analyzed words and frequencies in this MarkovText
     */
    public String serializeToString()
    {
        return StringKit.join(" ", words) + "\t" + StringKit.join(",", pairs.keysAsArray()) + "\t" + Converters.convertArrayInt2D.stringify(processed);
    }

    /**
     * Recreates an already-analyzed MarkovText given a String produced by {@link #serializeToString()}.
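     * <br>
     * A sketch of the intended save/load pattern ({@code bigCorpus} and the storage of {@code stored} are
     * hypothetical; any mechanism works as long as the String round-trips unchanged):
     * <pre>{@code
     * // during pre-processing, once:
     * MarkovText markov = new MarkovText();
     * markov.analyze(bigCorpus);
     * String stored = markov.serializeToString(); // write this String to a file or resource
     * // on later runs, instead of calling analyze() again:
     * MarkovText loaded = MarkovText.deserializeFromString(stored);
     * String sentence = loaded.chain(12345L);
     * }</pre>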
     * @param data a String returned by {@link #serializeToString()}
     * @return a MarkovText that is ready to generate text with {@link #chain(long)}
     */
    public static MarkovText deserializeFromString(String data)
    {
        int split = data.indexOf('\t');
        MarkovText markov = new MarkovText();
        markov.words = StringKit.split(data.substring(0, split), " ");
        int[] arr = Converters.convertArrayInt.restore(data.substring(split+1, split = data.indexOf('\t', split + 1)));
        markov.pairs = new IntIntOrderedMap(arr, ArrayTools.range(arr.length));
        markov.processed = Converters.convertArrayInt2D.restore(data.substring(split + 1));
        return markov;
    }

    /**
     * Copies the String array {@link #words}, the pair map {@link #pairs}, and the 2D jagged int array
     * {@link #processed} into a new MarkovText. None of the arrays will be equivalent references, but the Strings
     * (being immutable) will be the same objects in both MarkovText instances. This is primarily useful with
     * {@link #changeNames(NaturalLanguageCipher)}, which can produce several variants on names given several initial
     * copies produced with this method.
     * @return a copy of this MarkovText
     */
    public MarkovText copy()
    {
        MarkovText other = new MarkovText();
        other.words = new String[words.length];
        System.arraycopy(words, 0, other.words, 0, words.length);
        other.processed = new int[processed.length][];
        int len;
        for (int i = 0; i < processed.length; i++) {
            other.processed[i] = new int[len = processed[i].length];
            System.arraycopy(processed[i], 0, other.processed[i], 0, len);
        }
        // also rebuild pairs, since chain() needs it; its values are simply the insertion-order indices
        other.pairs = new IntIntOrderedMap(pairs.keysAsArray(), ArrayTools.range(pairs.size()));
        return other;
    }
}