Source code

001package squidpony;
002
003import squidpony.squidmath.Arrangement;
004import squidpony.squidmath.IntVLA;
005
006import java.io.Serializable;
007import java.util.ArrayList;
008import java.util.List;
009
010/**
011 * A simple Markov chain generator that works with Lists of some type instead of text like {@link MarkovTextLimited}.
012 * Call {@link #analyze(Iterable)} or {@link #analyze(Object[])} once on a large sample Iterable or array where
013 * sequences of items matter (this is called a corpus, and could be e.g. a List or an array), then you can call
014 * {@link #chain(long)} many times to get "remixes" of the sample Iterable/array as a List. This is meant to allow easy
015 * serialization of the necessary data to call chain(); if you can store the {@link #body} and {@link #processed} data
016 * structures in some serialized form, then you can reassign them to the same fields to avoid calling analyze(). This
017 * requires some way to serialize body, which is an {@link Arrangement} of T, and so T must be serializable in some way
018 * (not necessarily the {@link java.io.Serializable} interface, but possibly that).
019 * <br>
020 * Created by Tommy Ettinger on 2/26/2018.
021 */
022public class MarkovObject<T> implements Serializable {
023    private static final long serialVersionUID = 0L;
024
025    /**
026     * All unique T items that this encountered during the latest call to {@link #analyze(Iterable)}.
027     * Will be null if analyze() was never called.
028     */
029    public Arrangement<T> body;
030    /**
031     * Complicated data that mixes probabilities and the indices of items in {@link #body}, generated during the latest
032     * call to {@link #analyze(Iterable)}. Will be null if analyze() was never called.
033     */
034    public ArrayList<IntVLA> processed;
035    public ArrayList<IntVLA> raw;
036
037    public MarkovObject()
038    {
039    }
040
041    /**
042     * This is the main necessary step before using a MarkovObject; you must call this method at some point before you
043     * can call any other methods. This method analyzes the pairings of items in a (typically large) corpus Iterable.
044     * It only uses one preceding item to determine the subsequent word. It does not store any items as special stop
045     * terms, but it does use {@code null} to represent the start of a section (effectively treating any corpus as
046     * starting with null prepended), and will not produce null as output from {@link #chain(long)}. If null is
047     * encountered as part of corpus, it will be interpreted as a point to stop on and potentially start a new section.
048     * Since the last item in the corpus could have no known items to produce after it, the end of the corpus is treated
049     * as having null appended as well. When it finishes processing, it stores the results in {@link #body} and
050     * {@link #processed}, which allows other methods to be called (they will throw a {@link NullPointerException} if
051     * analyze() hasn't been called).
052     * <br>
053     * Unlike in {@link MarkovTextLimited}, you can analyze multiple corpus Iterables by calling this method more than once.
054     *
055     * @param corpus a typically-large sample Iterable in the style that should be mimicked
056     */
057    public void analyze(Iterable<T> corpus)
058    {
059        if(body == null || processed == null) {
060            body = new Arrangement<>(64);
061            body.add(null);
062            raw = new ArrayList<>(64);
063            raw.add(new IntVLA(128));
064            processed = new ArrayList<>(64);
065            processed.add(new IntVLA(128));
066        }
067        int previous = 0, current;
068        for (T item : corpus)
069        {
070            current = body.addOrIndex(item);
071            if(raw.size() != body.size())
072            {
073                raw.add(new IntVLA(16));
074            }
075            raw.get(previous).add(current);
076            previous = current;
077        }
078        raw.get(previous).add(0);
079        IntVLA w, v;
080        final int len = raw.size();
081        processed.ensureCapacity(len);
082        //processed = new int[len][];
083        w = new IntVLA(128);
084        IntVLA small = new IntVLA(128);
085        IntVLA large = new IntVLA(128);
086        IntVLA probabilities = new IntVLA(128);
087        for(int iv = 0; iv < len; iv++ )
088        {
089            v = raw.get(iv);
090            w.clear();
091            probabilities.clear();
092            if(v.size <= 0)
093            {
094                v.add(1);
095            }
096            int vv;
097            final long vs = v.size;
098            OUTER:
099            for (int i = 0; i < v.size; ++i) {
100                vv = v.get(i);
101                for (int j = 0; j < w.size; j++) {
102                    if (w.get(j) == vv) {
103                        probabilities.incr(j, 1);
104                        continue OUTER;
105                    }
106                }
107                w.add(vv);
108                probabilities.add(1);
109            }
110            final int iAverage = (int)((0x7FFFFFFFL * w.size) / v.size);
111
112            small.clear();
113            large.clear();
114            /* Populate the stacks with the input probabilities. */
115            for (int i = 0; i < probabilities.size; i++) {
116                /* If the probability is below the average probability, then we add
117                 * it to the small list; otherwise we add it to the large list.
118                 */
119                if (probabilities.get(i) * 0x7FFFFFFFL >= iAverage * vs)
120                    large.add(i);
121                else
122                    small.add(i);
123            }
124            if(processed.size() <= iv)
125                processed.add(v = new IntVLA(w.size * 3));
126            else
127            {
128                v = processed.get(iv);
129                v.clear();
130            }
131            final int[] va = v.setSize(w.size * 3);
132            //processed[iv] = new int[w.size * 3];
133
134            while (!small.isEmpty() && !large.isEmpty()) {
135                /* Get the index of the small and the large probabilities. */
136                int less = small.pop(), less2 = less * 3;
137                int more = large.pop();
138
139                /* These probabilities have not yet been scaled up to be such that
140                 * sum/n is given weight 1.0.  We do this here instead.
141                 */
142                va[less2] = iAverage * probabilities.get(less);
143                va[less2+1] = w.get(less);
144                va[less2+2] = w.get(more);
145
146                probabilities.incr(more, probabilities.get(less) - iAverage);
147
148                if (probabilities.get(more) >= iAverage)
149                    large.add(more);
150                else
151                    small.add(more);
152            }
153            int t;
154            while (!small.isEmpty())
155            {
156                va[(t = small.pop()) * 3] = 0x7FFFFFFF;
157                va[t * 3 + 1] = va[t * 3 + 2] = w.get(t);
158            }
159            while (!large.isEmpty())
160            {
161                va[(t = large.pop()) * 3] = 0x7FFFFFFF;
162                va[t * 3 + 1] = va[t * 3 + 2] = w.get(t);
163            }
164        }
165    }
166
167    /**
168     * This is the main necessary step before using a MarkovObject; you must call this method at some point before you
169     * can call any other methods. This method analyzes the pairings of items in a (typically large) corpus array of T.
170     * It only uses one preceding item to determine the subsequent word. It does not store any items as special stop
171     * terms, but it does use {@code null} to represent the start of a section (effectively treating any corpus as
172     * starting with null prepended), and will not produce null as output from {@link #chain(long)}. If null is
173     * encountered as part of corpus, it will be interpreted as a point to stop on and potentially start a new section.
174     * Since the last item in the corpus could have no known items to produce after it, the end of the corpus is treated
175     * as having null appended as well. When it finishes processing, it stores the results in {@link #body} and
176     * {@link #processed}, which allows other methods to be called (they will throw a {@link NullPointerException} if
177     * analyze() hasn't been called).
178     * <br>
179     * Unlike in {@link MarkovTextLimited}, you can analyze multiple corpus arrays by calling this method more than once.
180     *
181     * @param corpus a typically-large sample array of T in the style that should be mimicked
182     */
183    public void analyze(T[] corpus)
184    {
185        if(body == null || processed == null) {
186            body = new Arrangement<>(corpus.length * 3 >> 2);
187            body.add(null);
188            raw = new ArrayList<>(corpus.length * 3 >> 2);
189            raw.add(new IntVLA(128));
190            processed = new ArrayList<>(corpus.length * 3 >> 2);
191            processed.add(new IntVLA(128));
192        }
193        int previous = 0, current;
194        for (T item : corpus)
195        {
196            current = body.addOrIndex(item);
197            if(raw.size() != body.size())
198            {
199                raw.add(new IntVLA(16));
200            }
201            raw.get(previous).add(current);
202            previous = current;
203        }
204        raw.get(previous).add(0);
205        IntVLA w, v;
206        final int len = raw.size();
207        processed.ensureCapacity(len);
208        //processed = new int[len][];
209        w = new IntVLA(128);
210        IntVLA small = new IntVLA(128);
211        IntVLA large = new IntVLA(128);
212        IntVLA probabilities = new IntVLA(128);
213        for(int iv = 0; iv < len; iv++ )
214        {
215            v = raw.get(iv);
216            w.clear();
217            probabilities.clear();
218            if(v.size <= 0)
219            {
220                v.add(1);
221            }
222            int vv;
223            final long vs = v.size;
224            OUTER:
225            for (int i = 0; i < v.size; ++i) {
226                vv = v.get(i);
227                for (int j = 0; j < w.size; j++) {
228                    if (w.get(j) == vv) {
229                        probabilities.incr(j, 1);
230                        continue OUTER;
231                    }
232                }
233                w.add(vv);
234                probabilities.add(1);
235            }
236            final int iAverage = (int)((0x7FFFFFFFL * w.size) / v.size);
237
238            small.clear();
239            large.clear();
240            /* Populate the stacks with the input probabilities. */
241            for (int i = 0; i < probabilities.size; i++) {
242                /* If the probability is below the average probability, then we add
243                 * it to the small list; otherwise we add it to the large list.
244                 */
245                if (probabilities.get(i) * 0x7FFFFFFFL >= iAverage * vs)
246                    large.add(i);
247                else
248                    small.add(i);
249            }
250            if(processed.size() <= iv)
251                processed.add(v = new IntVLA(w.size * 3));
252            else
253            {
254                v = processed.get(iv);
255                v.clear();
256            }
257            final int[] va = v.setSize(w.size * 3);
258            //processed[iv] = new int[w.size * 3];
259
260            while (!small.isEmpty() && !large.isEmpty()) {
261                /* Get the index of the small and the large probabilities. */
262                int less = small.pop(), less2 = less * 3;
263                int more = large.pop();
264
265                /* These probabilities have not yet been scaled up to be such that
266                 * sum/n is given weight 1.0.  We do this here instead.
267                 */
268                va[less2] = iAverage * probabilities.get(less);
269                va[less2+1] = w.get(less);
270                va[less2+2] = w.get(more);
271
272                probabilities.incr(more, probabilities.get(less) - iAverage);
273
274                if (probabilities.get(more) >= iAverage)
275                    large.add(more);
276                else
277                    small.add(more);
278            }
279            int t;
280            while (!small.isEmpty())
281            {
282                va[(t = small.pop()) * 3] = 0x7FFFFFFF;
283                va[t * 3 + 1] = va[t * 3 + 2] = w.get(t);
284            }
285            while (!large.isEmpty())
286            {
287                va[(t = large.pop()) * 3] = 0x7FFFFFFF;
288                va[t * 3 + 1] = va[t * 3 + 2] = w.get(t);
289            }
290        }
291    }
292
293    /**
294     * Generates a 32-element List of T based on the given seed and previously analyzed corpus data (using
295     * {@link #analyze(Iterable)}). This can't stop before generating a chain of 32 items unless analyze() hasn't been
296     * called or it was called on an empty or invalid Iterable/array (i.e. all null).
297     * @param seed the seed for the random decisions this makes, as a long; any long can be used
298     * @return a 32-element T List generated from the analyzed corpus Iterable/array's pairings of items
299     */
300    public List<T> chain(long seed) {
301        return chain(seed, 32, false, new ArrayList<T>(32));
302    }
303
304    /**
305     * Adds T items to buffer to fill it up to maxLength, based on the given seed and previously analyzed corpus
306     * data (using {@link #analyze(Iterable)}). If buffer is already at least as long as maxLength, if analyze() hasn't
307     * been called or if it was called on an empty or invalid Iterable/array (i.e. all null), then this won't change
308     * buffer and will return it as-is. If null was present in the analyzed corpus along with other items and
309     * canStopEarly is true, then if null would be generated this will instead stop adding items to buffer and return
310     * buffer as it is. If canStopEarly was false in the last case, the generated null would be discarded and a value
311     * from the start of the corpus or following a null in the corpus would be used instead.
312     * @param seed the seed for the random decisions this makes, as a long; any long can be used
313     * @param maxLength the maximum length for the generated List, in items
314     * @param canStopEarly if true, this may add less than maxLength elements if null was present in the corpus
315     * @param buffer a List of T that will have elements added until maxLength is reached; if it already is larger than
316     *               maxLength this won't do anything
317     * @return buffer, after items were added to fill maxLength (or to fill less if this stopped early)
318     */
319    public List<T> chain(long seed, int maxLength, boolean canStopEarly, List<T> buffer) {
320        if(body == null || body.size() <= 1)
321            return buffer;
322        int before = 0;
323        long state;
324        IntVLA rf;
325        while (buffer.size() < maxLength) {
326            rf = processed.get(before);
327            // This is LightRNG's algorithm to generate a random long given sequential states
328            state = ((state = ((state = ((seed += 0x9E3779B97F4A7C15L) ^ seed >>> 30) * 0xBF58476D1CE4E5B9L) ^ state >>> 27) * 0x94D049BB133111EBL) ^ state >>> 31);
329
330            // get a random int (using half the bits of our previously-calculated state) that is less than size
331            int column = (int) ((rf.size * (state & 0xFFFFFFFFL)) / 0x300000000L) * 3; // divide by 2^32, round down to multiple of 3
332            // use the other half of the bits of state to get a double, compare to probability and choose either the
333            // current column or the alias for that column based on that probability
334            before = ((state >>> 33) <= rf.get(column)) ? rf.get(column + 1) : rf.get(column + 2);
335            if(before != 0)
336            {                                  
337                buffer.add(body.keyAt(before));
338            }
339            else if(canStopEarly)
340            {
341                break;
342            }
343        }
344        return buffer;
345    }
346    
347    /**
348     * Copies the T items in {@link #body} and the int-based data structure {@link #processed} into a new MarkovObject.
349     * None of the inner values, such as IntVLA values in processed, will be equivalent references, but the items in
350     * body will be the same objects in both MarkovObject instances.
351     * @return a copy of this MarkovObject
352     */
353    public MarkovObject<T> copy()
354    {
355        MarkovObject<T> other = new MarkovObject<>();
356        other.body = new Arrangement<>(body);
357        other.processed = new ArrayList<>(processed.size());
358        for (int i = 0; i < processed.size(); i++) {
359            other.processed.add(new IntVLA(processed.get(i)));
360        }
361        return other;
362    }
363}