Source code

001package squidpony;
002
003import regexodus.Category;
004import regexodus.Matcher;
005import regexodus.Pattern;
006import squidpony.squidmath.Arrangement;
007import squidpony.squidmath.IntVLA;
008
009import java.io.Serializable;
010import java.util.ArrayList;
011
012/**
013 * A simple Markov chain text generator; it is called "Limited" because it only can be used as an order-1 Markov chain,
014 * meaning only one prior word is looked at. To use, call {@link #analyze(CharSequence)} once on a large sample text,
015 * then you can call {@link #chain(long)} many times to get odd-sounding "remixes" of the sample text. For more natural
016 * output, you can use {@link MarkovText}, which is an order-2 Markov chain and so looks at two previous words. This is
017 * meant to allow easy serialization of the necessary data to call chain(); if you can store the {@link #words} and
018 * {@link #processed} arrays in some serialized form, then you can reassign them to the same fields to avoid calling
019 * analyze(). One way to do this conveniently is to use {@link #serializeToString()} after calling analyze() once and to
020 * save the resulting String; then, rather than calling analyze() again on future runs, you would call
021 * {@link #deserializeFromString(String)} to create the MarkovTextLimited without needing any repeated analysis.
022 * <br>
023 * Created by Tommy Ettinger on 1/30/2018.
024 */
025public class MarkovTextLimited implements Serializable {
026    private static final long serialVersionUID = 0L;
027
028    /**
029     * All words (case-sensitive and counting some punctuation as part of words) that this encountered during the latest
030     * call to {@link #analyze(CharSequence)}. Will be null if analyze() was never called.
031     */
032    public String[] words;
033    /**
034     * Complicated data that mixes probabilities and the indices of words in {@link #words}, generated during the latest
035     * call to {@link #analyze(CharSequence)}. This is a jagged 2D array. Will be null if analyze() was never called.
036     */
037    public int[][] processed;
038
039    private static final String INITIAL = "", FULL_STOP = ".", EXCLAMATION = "!", QUESTION = "?", ELLIPSIS = "...";
040    private static final Matcher matcher = Pattern.compile("\\.\\.\\.|[\\.!\\?]|[^\\.!\\?\"\\(\\)\\[\\]\\{\\}\\s]+").matcher();
041    public MarkovTextLimited()
042    {
043    }
044
045    /**
046     * This is the main necessary step before using a MarkovTextLimited; you must call this method at some point before you can
047     * call any other methods. You can serialize this MarkovTextLimited after calling to avoid needing to call this again on later
048     * runs, or even include serialized MarkovTextLimited objects with a game to only need to call this during pre-processing.
049     * This method analyzes the pairings of words in a (typically large) corpus text, including some punctuation as part
050     * of words and some kinds as their own "words." It only uses one preceding word to determine the subsequent word.
051     * When it finishes processing, it stores the results in {@link #words} and {@link #processed}, which allows other
052     * methods to be called (they will throw a {@link NullPointerException} if analyze() hasn't been called).
053     * @param corpus a typically-large sample text in the style that should be mimicked
054     */
055    public void analyze(CharSequence corpus)
056    {
057        Arrangement<String> body = new Arrangement<>(corpus.length() / 5 + 5);
058        body.add(INITIAL);
059        body.add(FULL_STOP);
060        body.add(EXCLAMATION);
061        body.add(QUESTION);
062        body.add(ELLIPSIS);
063        ArrayList<IntVLA> working = new ArrayList<>(corpus.length() / 5 + 5);
064        working.add(new IntVLA(128));
065        int[] links = {0};
066        working.add(new IntVLA(links));
067        working.add(new IntVLA(links));
068        working.add(new IntVLA(links));
069        working.add(new IntVLA(links));
070
071        matcher.setTarget(corpus);
072        int previous = 0, current;
073        while (matcher.find())
074        {
075            current = body.addOrIndex(matcher.group());
076            if(working.size() != body.size())
077            {
078                working.add(new IntVLA(16));
079            }
080            working.get(previous).add(current);
081            if(current > 0 && current < 5)
082            {
083                working.get(current).add(0);
084                previous = 0;
085            }
086            else
087                previous = current;
088        }
089        IntVLA w = working.get(previous), v;
090        if(w.size == 0) w.add(0);
091        final int len = working.size();
092        words = new String[len];
093        body.keySet().toArray(words);
094        processed = new int[len][];
095        w = new IntVLA(128);
096        IntVLA small = new IntVLA(128);
097        IntVLA large = new IntVLA(128);
098        IntVLA probabilities = new IntVLA(128);
099        for(int iv = 0; iv < len; iv++ )
100        {
101            v = working.get(iv);
102            w.clear();
103            probabilities.clear();
104            if(v.size <= 0)
105            {
106                v.add(1);
107            }
108            int vv, sum = 0;
109            final int vs = v.size;
110            OUTER:
111            for (int i = 0; i < vs; ++i) {
112                vv = v.get(i);
113                for (int j = 0; j < w.size; j++) {
114                    if (w.get(j) == vv) {
115                        probabilities.incr(j, 0x10000);
116                        sum += 0x10000;
117                        continue OUTER;
118                    }
119                }
120                w.add(vv);
121                probabilities.add(0x10000);
122                sum += 0x10000;
123            }
124            int iAverage = (sum / w.size);
125
126            small.clear();
127            large.clear();
128            /* Populate the stacks with the input probabilities. */
129            for (int i = 0; i < probabilities.size; i++) {
130                /* If the probability is below the average probability, then we add
131                 * it to the small list; otherwise we add it to the large list.
132                 */
133                if (probabilities.get(i) >= iAverage)
134                    large.add(i);
135                else
136                    small.add(i);
137            }
138
139            processed[iv] = new int[w.size * 3];
140
141            while (!small.isEmpty() && !large.isEmpty()) {
142                /* Get the index of the small and the large probabilities. */
143                int less = small.pop(), less2 = less * 3;
144                int more = large.pop();
145
146                /* These probabilities have not yet been scaled up to be such that
147                 * sum/n is given weight 1.0.  We do this here instead.
148                 */
149                processed[iv][less2] = (probabilities.size * probabilities.get(less)) / (sum >> 16);
150                processed[iv][less2+1] = w.get(less);
151                processed[iv][less2+2] = w.get(more);
152                vv = probabilities.get(less) - iAverage;
153                probabilities.incr(more, vv);
154                if (probabilities.get(more) >= iAverage)
155                    large.add(more);
156                else
157                    small.add(more);
158            }
159            int t;
160            while (!small.isEmpty())
161            {
162                processed[iv][(t = small.pop()) * 3] = 0xFFFF;
163                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
164            }
165            while (!large.isEmpty())
166            {
167                processed[iv][(t = large.pop()) * 3] = 0xFFFF;
168                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
169            }
170        }
171    }
172
173    /**
174     * After calling {@link #analyze(CharSequence)}, you can optionally call this to alter any words in this MarkovTextLimited that
175     * were used as a proper noun (determined by whether they were capitalized in the middle of a sentence), changing
176     * them to a ciphered version using the given {@link NaturalLanguageCipher}. Normally you would initialize a
177     * NaturalLanguageCipher with a {@link FakeLanguageGen} that matches the style you want for all names in this text,
178     * then pass that to this method during pre-processing (not necessarily at runtime, since this method isn't
179     * especially fast if the corpus was large). This method modifies this MarkovTextLimited in-place.
180     * @param translator a NaturalLanguageCipher that will be used to translate proper nouns in this MarkovTextLimited's word array
181     */
182    public void changeNames(NaturalLanguageCipher translator)
183    {
184        String name;
185        PER_WORD:
186        for (int i = 5; i < words.length; i++) {
187            if(Category.Lu.contains((name = words[i]).charAt(0)))
188            {
189                for (int w = 5; w < words.length; w++) {
190                    for (int p = 0; p < processed[w].length; p++) {
191                        if (i == processed[w][++p] || i == processed[w][++p])
192                        {
193                            words[i] = translator.cipher(name);
194                            continue PER_WORD;
195                        }
196                    }
197                }
198            }
199        }
200    }
201    /**
202     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
203     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "..."), or once
204     * the length would be greater than 200 characters without encountering stop punctuation(it terminates such a
205     * sentence with "." or "...").
206     * @param seed the seed for the random decisions this makes, as a long; any long can be used
207     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
208     */
209    public String chain(long seed) {
210        return chain(seed, 200);
211    }
212
213    /**
214     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
215     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "...") or once
216     * the maxLength would be exceeded by any other words (it terminates such a sentence with "." or "...").
217     * @param seed the seed for the random decisions this makes, as a long; any long can be used
218     * @param maxLength the maximum length for the generated String, in number of characters
219     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
220     */
221    public String chain(long seed, int maxLength) {
222        int before = 0;
223        boolean later;
224        long state;
225        StringBuilder sb = new StringBuilder(1000);
226        int[] rf;
227        while (sb.length() < maxLength) {
228            if(sb.length() >= maxLength - 3)
229            {
230                sb.append('.');
231                break;
232            }
233            later = (before != 0);
234            rf = processed[before];
235            // This is LightRNG's algorithm to generate a random long given sequential states
236            state = ((state = ((state = ((seed += 0x9E3779B97F4A7C15L) ^ seed >>> 30) * 0xBF58476D1CE4E5B9L) ^ state >>> 27) * 0x94D049BB133111EBL) ^ state >>> 31);
237            // get a random int (using half the bits of our previously-calculated state) that is less than size
238            int column = (int) ((rf.length * (state & 0xFFFFFFFFL)) / 0x300000000L) * 3; // divide by 2^32, round down to multiple of 3
239            // use the other half of the bits of state to get a double, compare to probability and choose either the
240            // current column or the alias for that column based on that probability
241            //before = ((state >>> 33) > rf[column]) ? rf[column + 1] : rf[column + 2];
242            if((state >>> 48) <= rf[column])
243                before = rf[column + 1];
244            else
245                before = rf[column + 2];
246            if(before >= 5)
247            {
248                if(sb.length() + words[before].length() + 1 < maxLength)
249                {
250                    if(later)
251                        sb.append(' ');
252                    sb.append(words[before]);
253                }
254                else
255                {
256                    if(sb.length() + 3 <= maxLength)
257                        sb.append("...");
258                    else
259                        sb.append('.');
260                    break;
261                }
262            }
263            else if(before != 0)
264            {
265                sb.append(words[before]);
266                break;
267            }
268        }
269        return sb.toString();
270    }
271
272    /**
273     * Returns a representation of this MarkovTextLimited as a String; use {@link #deserializeFromString(String)} to get a
274     * MarkovTextLimited back from this String. The {@link #words} and {@link #processed} fields must have been given values by
275     * either direct assignment, calling {@link #analyze(CharSequence)}, or building this MarkovTest with the
276     * aforementioned deserializeToString method. Uses spaces to separate words and a tab to separate the two fields.
277     * @return a String that can be used to store the analyzed words and frequencies in this MarkovTextLimited
278     */
279    public String serializeToString()
280    {
281        return StringKit.join(" ", words) + "\t" + Converters.convertArrayInt2D.stringify(processed);
282    }
283
284    /**
285     * Recreates an already-analyzed MarkovTextLimited given a String produced by {@link #serializeToString()}.
286     * @param data a String returned by {@link #serializeToString()}
287     * @return a MarkovTextLimited that is ready to generate text with {@link #chain(long)}
288     */
289    public static MarkovTextLimited deserializeFromString(String data)
290    {
291        int split = data.indexOf('\t');
292        MarkovTextLimited markov = new MarkovTextLimited();
293        markov.words = StringKit.split(data.substring(0, split), " ");
294        markov.processed = Converters.convertArrayInt2D.restore(data.substring(split + 1));
295        return markov;
296    }
297
298    /**
299     * Copies the String array {@link #words} and the 2D jagged int array {@link #processed} into a new MarkovTextLimited.
300     * None of the arrays will be equivalent references, but the Strings (being immutable) will be the same objects in
301     * both MarkovTextLimited instances. This is primarily useful with {@link #changeNames(NaturalLanguageCipher)}, which can
302     * produce several variants on names given several initial copies produced with this method.
303     * @return a copy of this MarkovTextLimited
304     */
305    public MarkovTextLimited copy()
306    {
307        MarkovTextLimited other = new MarkovTextLimited();
308        other.words = new String[words.length];
309        System.arraycopy(words, 0, other.words, 0, words.length);
310        other.processed = new int[processed.length][];
311        int len;
312        for (int i = 0; i < processed.length; i++) {
313            other.processed[i] = new int[len = processed[i].length];
314            System.arraycopy(processed[i], 0, other.processed[i], 0, len);
315        }
316        return other;
317    }
318}