001package squidpony;
002
003import regexodus.Category;
004import regexodus.Matcher;
005import regexodus.Pattern;
006import squidpony.squidmath.Arrangement;
007import squidpony.squidmath.IntIntOrderedMap;
008import squidpony.squidmath.IntVLA;
009
010import java.io.Serializable;
011import java.util.ArrayList;
012
013/**
014 * A simple Markov chain text generator; call {@link #analyze(CharSequence)} once on a large sample text, then you can
015 * call {@link #chain(long)} many times to get odd-sounding "remixes" of the sample text. This is meant to allow easy
016 * serialization of the necessary data to call chain(); if you can store the {@link #chars} and {@link #processed}
017 * arrays in some serialized form, then you can reassign them to the same fields to avoid calling analyze(). One way to
018 * do this conveniently is to use {@link #serializeToString()} after calling analyze() once and to save the resulting
019 * String; then, rather than calling analyze() again on future runs, you would call
020 * {@link #deserializeFromString(String)} to create the MarkovText without needing any repeated analysis.
021 * <br>
022 * Created by Tommy Ettinger on 1/30/2018.
023 */
024public class MarkovChar implements Serializable {
025    private static final long serialVersionUID = 1L;
026
027    /**
028     * All chars (case-sensitive and only counting chars that are letters in Unicode) that this encountered during the 
029     * latest call to {@link #analyze(CharSequence)}. Will be null if {@link #analyze(CharSequence)} was never called.
030     */
031    public char[] chars;
032
033    /**
034     * Map of all pairs of chars encountered to the position in the order they were encountered. Pairs are stored using
035     * their 16-bit {@link #chars} indices placed into the most-significant bits for the first word and the
036     * least-significant bits for the second word. The size of this IntIntOrderedMap is likely to be larger than the
037     * char array {@link #chars}, but should be equal to {@code processed.length}. Will be null if
038     * {@link #analyze(CharSequence)} was never called.
039     */
040    public IntIntOrderedMap pairs;
041    /**
042     * Complicated data that mixes probabilities of chars using their indices in {@link #chars} and the indices of char
043     * pairs in {@link #pairs}, generated during the latest call to {@link #analyze(CharSequence)}. This is a jagged 2D
044     * array. Will be null if {@link #analyze(CharSequence)} was never called.
045     */
046    public int[][] processed;
047
048    private static final Character INITIAL = '^', END = ' ';
049    private static final Matcher matcher = Pattern.compile("[\\p{L}']").matcher();
050    public MarkovChar()
051    {
052    }
053
054    /**
055     * This is the main necessary step before using a MarkovText; you must call this method at some point before you can
056     * call any other methods. You can serialize this MarkovText after calling to avoid needing to call this again on later
057     * runs, or even include serialized MarkovText objects with a game to only need to call this during pre-processing.
058     * This method analyzes the pairings of words in a (typically large) corpus text, including some punctuation as part
059     * of words and some kinds as their own "words." It only uses one preceding word to determine the subsequent word.
060     * When it finishes processing, it stores the results in {@link #chars} and {@link #processed}, which allows other
061     * methods to be called (they will throw a {@link NullPointerException} if analyze() hasn't been called).
062     * @param corpus a typically-large sample text in the style that should be mimicked
063     */
064    public void analyze(CharSequence corpus)
065    {
066        final int clen = corpus.length();
067        Arrangement<Character> body = new Arrangement<>((clen >> 4) + 5);
068        pairs = new IntIntOrderedMap(clen / 5 + 5);
069        ArrayList<IntVLA> working = new ArrayList<>(clen / 5 + 5);
070        body.add(INITIAL);
071        working.add(new IntVLA(128));
072        pairs.put(0, 0);
073        body.add(END);
074//        working.add(new IntVLA(links));
075
076        //matcher.setTarget(corpus);
077        int current, pair = 0, pre = 0, post;
078        for (int i = 0; i < clen; i++) {
079            char c = corpus.charAt(i);
080            if('\'' != c && !Category.L.contains(c))
081                c = END;
082            current = body.addOrIndex(c);
083            pair = pair << 16 | (current & 0xFFFF);
084            if(pair == 1)
085                continue;
086            post = pairs.putIfAbsent(pair, pairs.size());
087            if(working.size() != pairs.size())
088            {
089                working.add(new IntVLA(16));
090            }
091            working.get(pre).add(current);
092            if(current == 1)
093            {
094                working.get(post).add(0);
095                pair = 0;
096                pre = 0;
097            }
098            else
099            {
100                pre = post;
101            }
102        }
103        IntVLA w = working.get(pre), v;
104        if(w.size == 0) w.add(0);
105        final int len = body.size(), pairLen = working.size();
106        chars = new char[len];
107        for (int i = 0; i < len; i++) {
108            chars[i] = body.keyAt(i);
109        }
110
111        processed = new int[pairLen][];
112        w = new IntVLA(128);
113        IntVLA small = new IntVLA(128);
114        IntVLA large = new IntVLA(128);
115        IntVLA probabilities = new IntVLA(128);
116        for(int iv = 0; iv < pairLen; iv++ )
117        {
118            v = working.get(iv);
119            w.clear();
120            probabilities.clear();
121            if(v.size <= 0)
122            {
123                v.add(1);
124            }
125            int vv, sum = 0;
126            final int vs = v.size;
127            OUTER:
128            for (int i = 0; i < vs; ++i) {
129                vv = v.get(i);
130                for (int j = 0; j < w.size; j++) {
131                    if (w.get(j) == vv) {
132                        probabilities.incr(j, 0x10000);
133                        sum += 0x10000;
134                        continue OUTER;
135                    }
136                }
137                w.add(vv);
138                probabilities.add(0x10000);
139                sum += 0x10000;
140            }
141            int iAverage = (sum / w.size);
142
143            small.clear();
144            large.clear();
145            /* Populate the stacks with the input probabilities. */
146            for (int i = 0; i < probabilities.size; i++) {
147                /* If the probability is below the average probability, then we add
148                 * it to the small list; otherwise we add it to the large list.
149                 */
150                if (probabilities.get(i) >= iAverage)
151                    large.add(i);
152                else
153                    small.add(i);
154            }
155
156            processed[iv] = new int[w.size * 3];
157
158            while (!small.isEmpty() && !large.isEmpty()) {
159                /* Get the index of the small and the large probabilities. */
160                int less = small.pop(), less2 = less * 3;
161                int more = large.pop();
162
163                /* These probabilities have not yet been scaled up to be such that
164                 * sum/n is given weight 1.0.  We do this here instead.
165                 */
166                processed[iv][less2] = (probabilities.size * probabilities.get(less)) / (sum >> 16);
167                processed[iv][less2+1] = w.get(less);
168                processed[iv][less2+2] = w.get(more);
169                vv = probabilities.get(less) - iAverage;
170                probabilities.incr(more, vv);
171                if (probabilities.get(more) >= iAverage)
172                    large.add(more);
173                else
174                    small.add(more);
175            }
176            int t;
177            while (!small.isEmpty())
178            {
179                processed[iv][(t = small.pop()) * 3] = 0xFFFF;
180                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
181            }
182            while (!large.isEmpty())
183            {
184                processed[iv][(t = large.pop()) * 3] = 0xFFFF;
185                processed[iv][t * 3 + 1] = processed[iv][t * 3 + 2] = w.get(t);
186            }
187        }
188    }
189
190    /**
191     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
192     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "..."), or once
193     * the length would be greater than 200 characters without encountering stop punctuation(it terminates such a
194     * sentence with "." or "...").
195     * @param seed the seed for the random decisions this makes, as a long; any long can be used
196     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
197     */
198    public String chain(long seed) {
199        return chain(seed, 200);
200    }
201
202    /**
203     * Generate a roughly-sentence-sized piece of text based on the previously analyzed corpus text (using
204     * {@link #analyze(CharSequence)}) that terminates when stop punctuation is used (".", "!", "?", or "...") or once
205     * the maxLength would be exceeded by any other words (it terminates such a sentence with "." or "...").
206     * @param seed the seed for the random decisions this makes, as a long; any long can be used
207     * @param maxLength the maximum length for the generated String, in number of characters
208     * @return a String generated from the analyzed corpus text's word placement, usually a small sentence
209     */
210    public String chain(long seed, int maxLength) {
211        int before, pair = 0;
212        boolean later;
213        long state;
214        StringBuilder sb = new StringBuilder(1000);
215        int[] rf;
216        while (sb.length() < maxLength) {
217            rf = processed[pairs.get(pair)];
218            // This is LightRNG's algorithm to generate a random long given sequential states
219            state = ((state = ((state = ((seed += 0x9E3779B97F4A7C15L) ^ seed >>> 30) * 0xBF58476D1CE4E5B9L) ^ state >>> 27) * 0x94D049BB133111EBL) ^ state >>> 31);
220            // get a random int (using half the bits of our previously-calculated state) that is less than size
221            int column = (int) ((rf.length * (state & 0xFFFFFFFFL)) / 0x300000000L) * 3; // divide by 2^32, round down to multiple of 3
222            // use the other half of the bits of state to get a double, compare to probability and choose either the
223            // current column or the alias for that column based on that probability
224            //before = ((state >>> 33) > rf[column]) ? rf[column + 1] : rf[column + 2];
225            if((state >>> 48) > rf[column])
226                before = rf[column + 1];
227            else
228                before = rf[column + 2];
229            if(before > 1)
230            {
231                if(sb.length() + 1 < maxLength)
232                {
233                    sb.append(chars[before]);
234                    pair = pair << 16 | (before & 0xFFFF);
235                }
236                else
237                {
238                    break;
239                }
240            }
241            else
242            {
243                break;
244            }
245            
246        }
247        return sb.toString();
248    }
249
250    /**
251     * Returns a representation of this MarkovText as a String; use {@link #deserializeFromString(String)} to get a
252     * MarkovText back from this String. The {@link #chars} and {@link #processed} fields must have been given values by
253     * either direct assignment, calling {@link #analyze(CharSequence)}, or building this MarkovTest with the
254     * aforementioned deserializeToString method. Uses spaces to separate words and a tab to separate the two fields.
255     * @return a String that can be used to store the analyzed words and frequencies in this MarkovText
256     */
257    public String serializeToString()
258    {
259        return String.valueOf(chars) + "\t" + StringKit.join(",", pairs.keysAsArray()) + "\t" + Converters.convertArrayInt2D.stringify(processed);
260    }
261
262    /**
263     * Recreates an already-analyzed MarkovText given a String produced by {@link #serializeToString()}.
264     * @param data a String returned by {@link #serializeToString()}
265     * @return a MarkovText that is ready to generate text with {@link #chain(long)}
266     */
267    public static MarkovChar deserializeFromString(String data)
268    {
269        int split = data.indexOf('\t');
270        MarkovChar markov = new MarkovChar();
271        markov.chars = data.substring(0, split).toCharArray();
272        int[] arr = Converters.convertArrayInt.restore(data.substring(split+1, split = data.indexOf('\t', split + 1)));
273        markov.pairs = new IntIntOrderedMap(arr, ArrayTools.range(arr.length));
274        markov.processed = Converters.convertArrayInt2D.restore(data.substring(split + 1));
275        return markov;
276    }
277
278    /**
279     * Copies the String array {@link #chars} and the 2D jagged int array {@link #processed} into a new MarkovText.
280     * None of the arrays will be equivalent references, but the Strings (being immutable) will be the same objects in
281     * both MarkovText instances.
282     * @return a copy of this MarkovText
283     */
284    public MarkovChar copy()
285    {
286        MarkovChar other = new MarkovChar();
287        other.chars = new char[chars.length];
288        System.arraycopy(chars, 0, other.chars, 0, chars.length);
289        other.processed = new int[processed.length][];
290        int len;
291        for (int i = 0; i < processed.length; i++) {
292            other.processed[i] = new int[len = processed[i].length];
293            System.arraycopy(processed[i], 0, other.processed[i], 0, len);
294        }
295        return other;
296    }
297}