001package squidpony;
002
003import squidpony.annotation.Beta;
004
005import java.nio.charset.StandardCharsets;
006import java.util.ArrayList;
007import java.util.HashMap;
008import java.util.HashSet;
009
010/**
011 * An experimental variant on LZSEncoding to encode byte arrays to compressed Strings, and decode them back. This always
012 * uses UTF-16-safe encoding, which means it does not use half of all possible chars in the compressed Strings but makes
013 * sure the Strings are valid UTF-16 (so they can be written to and read from file more safely).
014 * <br>
015 * This class does work, since it can read back what it writes, but it still hasn't been strenuously tested, and it
016 * could probably be optimized a lot.
017 * <br>
018 * Created by Tommy Ettinger on 1/11/2020.
019 */
020@Beta
021public final class ByteStringEncoding {
022    private ByteStringEncoding(){}
023
024    public static String compress(byte[] uncompressed) {
025        if (uncompressed == null) return null;
026        if (uncompressed.length == 0) return "";
027        final int bitsPerChar = 15, offset = 32;
028        int i, value;
029        HashMap<String, Integer> context_dictionary = new HashMap<>();
030        HashSet<String> context_dictionaryToCreate = new HashSet<>();
031        String context_c;
032        String context_wc;
033        String context_w = "";
034        int context_enlargeIn = 2; // Compensate for the first entry which should not count
035        int context_dictSize = 3;
036        int context_numBits = 2;
037        StringBuilder context_data = new StringBuilder(uncompressed.length >>> 1);
038        int context_data_val = 0;
039        int context_data_position = 0;
040        int ii;
041
042        for (ii = 0; ii < uncompressed.length; ii++) {
043            context_c = new String(uncompressed, ii, 1, StandardCharsets.ISO_8859_1);
044            if (!context_dictionary.containsKey(context_c)) {
045                context_dictionary.put(context_c, context_dictSize++);
046                context_dictionaryToCreate.add(context_c);
047            }
048
049            context_wc = context_w + context_c;
050            if (context_dictionary.containsKey(context_wc)) {
051                context_w = context_wc;
052            } else {
053                if (context_dictionaryToCreate.contains(context_w)) {
054                    value = (context_w.charAt(0) & 255);
055                    for (i = 0; i < context_numBits; i++) {
056                        context_data_val = (context_data_val << 1);
057                        if (context_data_position == bitsPerChar - 1) {
058                            context_data_position = 0;
059                            context_data.append((char) (context_data_val + offset));
060                            context_data_val = 0;
061                        } else {
062                            context_data_position++;
063                        }
064                    }
065                    for (i = 0; i < 8; i++) {
066                        context_data_val = (context_data_val << 1) | (value & 1);
067                        if (context_data_position == bitsPerChar - 1) {
068                            context_data_position = 0;
069                            context_data.append((char) (context_data_val + offset));
070                            context_data_val = 0;
071                        } else {
072                            context_data_position++;
073                        }
074                        value >>= 1;
075                    }
076                    context_enlargeIn--;
077                    if (context_enlargeIn == 0) {
078                        context_enlargeIn = 1 << context_numBits++;
079                    }
080                    context_dictionaryToCreate.remove(context_w);
081                } else {
082                    value = context_dictionary.get(context_w);
083                    for (i = 0; i < context_numBits; i++) {
084                        context_data_val = (context_data_val << 1) | (value & 1);
085                        if (context_data_position == bitsPerChar - 1) {
086                            context_data_position = 0;
087                            context_data.append((char) (context_data_val + offset));
088                            context_data_val = 0;
089                        } else {
090                            context_data_position++;
091                        }
092                        value >>= 1;
093                    }
094
095                }
096                context_enlargeIn--;
097                if (context_enlargeIn == 0) {
098                    context_enlargeIn = 1 << context_numBits++;
099                }
100                // Add wc to the dictionary.
101                context_dictionary.put(context_wc, context_dictSize++);
102                context_w = context_c;
103            }
104        }
105
106        // Output the code for w.
107        if (!context_w.isEmpty()) {
108            if (context_dictionaryToCreate.contains(context_w)) {
109//                if (context_w.charAt(0) < 256) {
110                for (i = 0; i < context_numBits; i++) {
111                    context_data_val = (context_data_val << 1);
112                    if (context_data_position == bitsPerChar - 1) {
113                        context_data_position = 0;
114                        context_data.append((char) (context_data_val + offset));
115                        context_data_val = 0;
116                    } else {
117                        context_data_position++;
118                    }
119                }
120                value = context_w.charAt(0);
121                for (i = 0; i < 8; i++) {
122                    context_data_val = (context_data_val << 1) | (value & 1);
123                    if (context_data_position == bitsPerChar - 1) {
124                        context_data_position = 0;
125                        context_data.append((char) (context_data_val + offset));
126                        context_data_val = 0;
127                    } else {
128                        context_data_position++;
129                    }
130                    value >>= 1;
131                }
132//                } else {
133//                    value = 1;
134//                    for (i = 0; i < context_numBits; i++) {
135//                        context_data_val = (context_data_val << 1) | value;
136//                        if (context_data_position == bitsPerChar - 1) {
137//                            context_data_position = 0;
138//                            context_data.append((char) (context_data_val + offset));
139//                            context_data_val = 0;
140//                        } else {
141//                            context_data_position++;
142//                        }
143//                        value = 0;
144//                    }
145//                    value = context_w.charAt(0);
146//                    for (i = 0; i < 16; i++) {
147//                        context_data_val = (context_data_val << 1) | (value & 1);
148//                        if (context_data_position == bitsPerChar - 1) {
149//                            context_data_position = 0;
150//                            context_data.append((char) (context_data_val + offset));
151//                            context_data_val = 0;
152//                        } else {
153//                            context_data_position++;
154//                        }
155//                        value >>= 1;
156//                    }
157//                }
158
159                context_dictionaryToCreate.remove(context_w);
160            } else {
161                value = context_dictionary.get(context_w);
162                for (i = 0; i < context_numBits; i++) {
163                    context_data_val = (context_data_val << 1) | (value & 1);
164                    if (context_data_position == bitsPerChar - 1) {
165                        context_data_position = 0;
166                        context_data.append((char) (context_data_val + offset));
167                        context_data_val = 0;
168                    } else {
169                        context_data_position++;
170                    }
171                    value >>= 1;
172                }
173
174            }
175        }
176
177        // Mark the end of the stream
178        value = 2;
179        for (i = 0; i < context_numBits; i++) {
180            context_data_val = (context_data_val << 1) | (value & 1);
181            if (context_data_position == bitsPerChar - 1) {
182                context_data_position = 0;
183                context_data.append((char) (context_data_val + offset));
184                context_data_val = 0;
185            } else {
186                context_data_position++;
187            }
188            value >>= 1;
189        }
190
191        // Flush the last char
192        while (true) {
193            context_data_val = (context_data_val << 1);
194            if (context_data_position == bitsPerChar - 1) {
195                context_data.append((char) (context_data_val + offset));
196                break;
197            } else
198                context_data_position++;
199        }
200        context_data.append(' ');
201        return context_data.toString();
202    }
203
204    public static byte[] decompress(String compressed) {
205        if (compressed == null)
206            return null;
207        if (compressed.isEmpty())
208            return new byte[0];
209        final char[] getNextValue = compressed.toCharArray();
210        final int length = getNextValue.length, resetValue = 16384, offset = -32;
211        ArrayList<String> dictionary = new ArrayList<>();
212        int enlargeIn = 4, dictSize = 4, numBits = 3, position = resetValue, index = 1, resb, maxpower, power,
213                resultLength = 0;
214        String entry, w, c;
215        ArrayList<String> result = new ArrayList<>();
216        char bits, val = (char) (getNextValue[0] + offset);
217
218        for (char i = 0; i < 3; i++) {
219            dictionary.add(i, String.valueOf(i));
220        }
221
222        bits = 0;
223        maxpower = 2;
224        power = 0;
225        while (power != maxpower) {
226            resb = val & position;
227            position >>= 1;
228            if (position == 0) {
229                position = resetValue;
230                val = (char) (getNextValue[index++] + offset);
231            }
232            bits |= (resb > 0 ? 1 : 0) << power++;
233        }
234
235        switch (bits) {
236            case 0:
237                bits = 0;
238                maxpower = 8;
239                power = 0;
240                while (power != maxpower) {
241                    resb = val & position;
242                    position >>= 1;
243                    if (position == 0) {
244                        position = resetValue;
245                        val = (char) (getNextValue[index++] + offset);
246                    }
247                    bits |= (resb > 0 ? 1 : 0) << power++;
248                }
249                c = String.valueOf(bits);
250                break;
251            case 1:
252                bits = 0;
253                maxpower = 16;
254                power = 0;
255                while (power != maxpower) {
256                    resb = val & position;
257                    position >>= 1;
258                    if (position == 0) {
259                        position = resetValue;
260                        val = (char) (getNextValue[index++] + offset);
261                    }
262                    bits |= (resb > 0 ? 1 : 0) << power++;
263                }
264                c = String.valueOf(bits);
265                break;
266            default:
267                return new byte[0];
268        }
269        dictionary.add(c);
270        w = c;
271        result.add(w);
272        resultLength += w.length();
273        while (true) {
274            if (index > length) {
275                return new byte[0];
276            }
277            int cc = 0;
278            maxpower = numBits;
279            power = 0;
280            while (power != maxpower) {
281                resb = val & position;
282                position >>= 1;
283                if (position == 0) {
284                    position = resetValue;
285                    val = (char) (getNextValue[index++] + offset);
286                }
287                cc |= (resb > 0 ? 1 : 0) << power++;
288            }
289            switch (cc) {
290                case 0:
291                    bits = 0;
292                    maxpower = 8;
293                    power = 0;
294                    while (power != maxpower) {
295                        resb = val & position;
296                        position >>= 1;
297                        if (position == 0) {
298                            position = resetValue;
299                            val = (char) (getNextValue[index++] + offset);
300                        }
301                        bits |= (resb > 0 ? 1 : 0) << power++;
302                    }
303
304                    dictionary.add(String.valueOf(bits));
305                    cc = dictSize++;
306                    enlargeIn--;
307                    break;
308                case 1:
309                    bits = 0;
310                    maxpower = 16;
311                    power = 0;
312                    while (power != maxpower) {
313                        resb = val & position;
314                        position >>= 1;
315                        if (position == 0) {
316                            position = resetValue;
317                            val = (char) (getNextValue[index++] + offset);
318                        }
319                        bits |= (resb > 0 ? 1 : 0) << power++;
320                    }
321                    dictionary.add(String.valueOf(bits));
322                    cc = dictSize++;
323                    enlargeIn--;
324                    break;
325                case 2:
326//                    byte[] bytes = new byte[resultLength];
327//                    String r;
328//                    for (int i = 0, p = 0, n = result.size(); i < n; i++) {
329//                        r = result.get(i);
330//                        System.arraycopy(r.getBytes(StandardCharsets.ISO_8859_1), 0, bytes, p, r.length());
331//                        p += r.length();
332//                    }
333//                    return bytes;
334                    StringBuilder sb = new StringBuilder(resultLength);
335                    for (int i = 0, n = result.size(); i < n; i++) {
336                        sb.append(result.get(i));
337                    }
338                    return sb.toString().getBytes(StandardCharsets.ISO_8859_1);
339            }
340
341            if (enlargeIn == 0) {
342                enlargeIn = 1 << numBits;
343                numBits++;
344            }
345
346            if (cc < dictionary.size() && dictionary.get(cc) != null) {
347                entry = dictionary.get(cc);
348            } else {
349                if (cc == dictSize) {
350                    entry = w + w.charAt(0);
351                } else {
352                    return new byte[0];
353                }
354            }
355            result.add(entry);
356            resultLength += entry.length();
357
358            // Add w+entry[0] to the dictionary.
359            dictionary.add(w + entry.charAt(0));
360            dictSize++;
361            enlargeIn--;
362
363            w = entry;
364
365            if (enlargeIn == 0) {
366                enlargeIn = 1 << numBits;
367                numBits++;
368            }
369
370        }
371
372    }
373
374}