001package squidpony; 002 003import squidpony.annotation.Beta; 004 005import java.nio.charset.StandardCharsets; 006import java.util.ArrayList; 007import java.util.HashMap; 008import java.util.HashSet; 009 010/** 011 * An experimental variant on LZSEncoding to encode byte arrays to compressed Strings, and decode them back. This always 012 * uses UTF-16-safe encoding, which means it does not use half of all possible chars in the compressed Strings but makes 013 * sure the Strings are valid UTF-16 (so they can be written to and read from file more safely). 014 * <br> 015 * This class does work, since it can read back what it writes, but it still hasn't been strenuously tested, and it 016 * could probably be optimized a lot. 017 * <br> 018 * Created by Tommy Ettinger on 1/11/2020. 019 */ 020@Beta 021public final class ByteStringEncoding { 022 private ByteStringEncoding(){} 023 024 public static String compress(byte[] uncompressed) { 025 if (uncompressed == null) return null; 026 if (uncompressed.length == 0) return ""; 027 final int bitsPerChar = 15, offset = 32; 028 int i, value; 029 HashMap<String, Integer> context_dictionary = new HashMap<>(); 030 HashSet<String> context_dictionaryToCreate = new HashSet<>(); 031 String context_c; 032 String context_wc; 033 String context_w = ""; 034 int context_enlargeIn = 2; // Compensate for the first entry which should not count 035 int context_dictSize = 3; 036 int context_numBits = 2; 037 StringBuilder context_data = new StringBuilder(uncompressed.length >>> 1); 038 int context_data_val = 0; 039 int context_data_position = 0; 040 int ii; 041 042 for (ii = 0; ii < uncompressed.length; ii++) { 043 context_c = new String(uncompressed, ii, 1, StandardCharsets.ISO_8859_1); 044 if (!context_dictionary.containsKey(context_c)) { 045 context_dictionary.put(context_c, context_dictSize++); 046 context_dictionaryToCreate.add(context_c); 047 } 048 049 context_wc = context_w + context_c; 050 if (context_dictionary.containsKey(context_wc)) { 051 context_w = context_wc; 052 } else { 053 if (context_dictionaryToCreate.contains(context_w)) { 054 value = (context_w.charAt(0) & 255); 055 for (i = 0; i < context_numBits; i++) { 056 context_data_val = (context_data_val << 1); 057 if (context_data_position == bitsPerChar - 1) { 058 context_data_position = 0; 059 context_data.append((char) (context_data_val + offset)); 060 context_data_val = 0; 061 } else { 062 context_data_position++; 063 } 064 } 065 for (i = 0; i < 8; i++) { 066 context_data_val = (context_data_val << 1) | (value & 1); 067 if (context_data_position == bitsPerChar - 1) { 068 context_data_position = 0; 069 context_data.append((char) (context_data_val + offset)); 070 context_data_val = 0; 071 } else { 072 context_data_position++; 073 } 074 value >>= 1; 075 } 076 context_enlargeIn--; 077 if (context_enlargeIn == 0) { 078 context_enlargeIn = 1 << context_numBits++; 079 } 080 context_dictionaryToCreate.remove(context_w); 081 } else { 082 value = context_dictionary.get(context_w); 083 for (i = 0; i < context_numBits; i++) { 084 context_data_val = (context_data_val << 1) | (value & 1); 085 if (context_data_position == bitsPerChar - 1) { 086 context_data_position = 0; 087 context_data.append((char) (context_data_val + offset)); 088 context_data_val = 0; 089 } else { 090 context_data_position++; 091 } 092 value >>= 1; 093 } 094 095 } 096 context_enlargeIn--; 097 if (context_enlargeIn == 0) { 098 context_enlargeIn = 1 << context_numBits++; 099 } 100 // Add wc to the dictionary. 101 context_dictionary.put(context_wc, context_dictSize++); 102 context_w = context_c; 103 } 104 } 105 106 // Output the code for w. 107 if (!context_w.isEmpty()) { 108 if (context_dictionaryToCreate.contains(context_w)) { 109// if (context_w.charAt(0) < 256) { 110 for (i = 0; i < context_numBits; i++) { 111 context_data_val = (context_data_val << 1); 112 if (context_data_position == bitsPerChar - 1) { 113 context_data_position = 0; 114 context_data.append((char) (context_data_val + offset)); 115 context_data_val = 0; 116 } else { 117 context_data_position++; 118 } 119 } 120 value = context_w.charAt(0); 121 for (i = 0; i < 8; i++) { 122 context_data_val = (context_data_val << 1) | (value & 1); 123 if (context_data_position == bitsPerChar - 1) { 124 context_data_position = 0; 125 context_data.append((char) (context_data_val + offset)); 126 context_data_val = 0; 127 } else { 128 context_data_position++; 129 } 130 value >>= 1; 131 } 132// } else { 133// value = 1; 134// for (i = 0; i < context_numBits; i++) { 135// context_data_val = (context_data_val << 1) | value; 136// if (context_data_position == bitsPerChar - 1) { 137// context_data_position = 0; 138// context_data.append((char) (context_data_val + offset)); 139// context_data_val = 0; 140// } else { 141// context_data_position++; 142// } 143// value = 0; 144// } 145// value = context_w.charAt(0); 146// for (i = 0; i < 16; i++) { 147// context_data_val = (context_data_val << 1) | (value & 1); 148// if (context_data_position == bitsPerChar - 1) { 149// context_data_position = 0; 150// context_data.append((char) (context_data_val + offset)); 151// context_data_val = 0; 152// } else { 153// context_data_position++; 154// } 155// value >>= 1; 156// } 157// } 158 159 context_dictionaryToCreate.remove(context_w); 160 } else { 161 value = context_dictionary.get(context_w); 162 for (i = 0; i < context_numBits; i++) { 163 context_data_val = (context_data_val << 1) | (value & 1); 164 if (context_data_position == bitsPerChar - 1) { 165 context_data_position = 0; 166 context_data.append((char) (context_data_val + offset)); 167 context_data_val = 0; 168 } else { 169 context_data_position++; 170 } 171 value >>= 1; 172 } 173 174 } 175 } 176 177 // Mark the end of the stream 178 value = 2; 179 for (i = 0; i < context_numBits; i++) { 180 context_data_val = (context_data_val << 1) | (value & 1); 181 if (context_data_position == bitsPerChar - 1) { 182 context_data_position = 0; 183 context_data.append((char) (context_data_val + offset)); 184 context_data_val = 0; 185 } else { 186 context_data_position++; 187 } 188 value >>= 1; 189 } 190 191 // Flush the last char 192 while (true) { 193 context_data_val = (context_data_val << 1); 194 if (context_data_position == bitsPerChar - 1) { 195 context_data.append((char) (context_data_val + offset)); 196 break; 197 } else 198 context_data_position++; 199 } 200 context_data.append(' '); 201 return context_data.toString(); 202 } 203 204 public static byte[] decompress(String compressed) { 205 if (compressed == null) 206 return null; 207 if (compressed.isEmpty()) 208 return new byte[0]; 209 final char[] getNextValue = compressed.toCharArray(); 210 final int length = getNextValue.length, resetValue = 16384, offset = -32; 211 ArrayList<String> dictionary = new ArrayList<>(); 212 int enlargeIn = 4, dictSize = 4, numBits = 3, position = resetValue, index = 1, resb, maxpower, power, 213 resultLength = 0; 214 String entry, w, c; 215 ArrayList<String> result = new ArrayList<>(); 216 char bits, val = (char) (getNextValue[0] + offset); 217 218 for (char i = 0; i < 3; i++) { 219 dictionary.add(i, String.valueOf(i)); 220 } 221 222 bits = 0; 223 maxpower = 2; 224 power = 0; 225 while (power != maxpower) { 226 resb = val & position; 227 position >>= 1; 228 if (position == 0) { 229 position = resetValue; 230 val = (char) (getNextValue[index++] + offset); 231 } 232 bits |= (resb > 0 ? 1 : 0) << power++; 233 } 234 235 switch (bits) { 236 case 0: 237 bits = 0; 238 maxpower = 8; 239 power = 0; 240 while (power != maxpower) { 241 resb = val & position; 242 position >>= 1; 243 if (position == 0) { 244 position = resetValue; 245 val = (char) (getNextValue[index++] + offset); 246 } 247 bits |= (resb > 0 ? 1 : 0) << power++; 248 } 249 c = String.valueOf(bits); 250 break; 251 case 1: 252 bits = 0; 253 maxpower = 16; 254 power = 0; 255 while (power != maxpower) { 256 resb = val & position; 257 position >>= 1; 258 if (position == 0) { 259 position = resetValue; 260 val = (char) (getNextValue[index++] + offset); 261 } 262 bits |= (resb > 0 ? 1 : 0) << power++; 263 } 264 c = String.valueOf(bits); 265 break; 266 default: 267 return new byte[0]; 268 } 269 dictionary.add(c); 270 w = c; 271 result.add(w); 272 resultLength += w.length(); 273 while (true) { 274 if (index > length) { 275 return new byte[0]; 276 } 277 int cc = 0; 278 maxpower = numBits; 279 power = 0; 280 while (power != maxpower) { 281 resb = val & position; 282 position >>= 1; 283 if (position == 0) { 284 position = resetValue; 285 val = (char) (getNextValue[index++] + offset); 286 } 287 cc |= (resb > 0 ? 1 : 0) << power++; 288 } 289 switch (cc) { 290 case 0: 291 bits = 0; 292 maxpower = 8; 293 power = 0; 294 while (power != maxpower) { 295 resb = val & position; 296 position >>= 1; 297 if (position == 0) { 298 position = resetValue; 299 val = (char) (getNextValue[index++] + offset); 300 } 301 bits |= (resb > 0 ? 1 : 0) << power++; 302 } 303 304 dictionary.add(String.valueOf(bits)); 305 cc = dictSize++; 306 enlargeIn--; 307 break; 308 case 1: 309 bits = 0; 310 maxpower = 16; 311 power = 0; 312 while (power != maxpower) { 313 resb = val & position; 314 position >>= 1; 315 if (position == 0) { 316 position = resetValue; 317 val = (char) (getNextValue[index++] + offset); 318 } 319 bits |= (resb > 0 ? 1 : 0) << power++; 320 } 321 dictionary.add(String.valueOf(bits)); 322 cc = dictSize++; 323 enlargeIn--; 324 break; 325 case 2: 326// byte[] bytes = new byte[resultLength]; 327// String r; 328// for (int i = 0, p = 0, n = result.size(); i < n; i++) { 329// r = result.get(i); 330// System.arraycopy(r.getBytes(StandardCharsets.ISO_8859_1), 0, bytes, p, r.length()); 331// p += r.length(); 332// } 333// return bytes; 334 StringBuilder sb = new StringBuilder(resultLength); 335 for (int i = 0, n = result.size(); i < n; i++) { 336 sb.append(result.get(i)); 337 } 338 return sb.toString().getBytes(StandardCharsets.ISO_8859_1); 339 } 340 341 if (enlargeIn == 0) { 342 enlargeIn = 1 << numBits; 343 numBits++; 344 } 345 346 if (cc < dictionary.size() && dictionary.get(cc) != null) { 347 entry = dictionary.get(cc); 348 } else { 349 if (cc == dictSize) { 350 entry = w + w.charAt(0); 351 } else { 352 return new byte[0]; 353 } 354 } 355 result.add(entry); 356 resultLength += entry.length(); 357 358 // Add w+entry[0] to the dictionary. 359 dictionary.add(w + entry.charAt(0)); 360 dictSize++; 361 enlargeIn--; 362 363 w = entry; 364 365 if (enlargeIn == 0) { 366 enlargeIn = 1 << numBits; 367 numBits++; 368 } 369 370 } 371 372 } 373 374}