// Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.c // CAUTION: This is just a decompression benchmark not the intended 32-threaded Nakamichi. // In this revision (2015-Aug-27) 'Trials' are added again and GCC 5.1.0 compilation (MinGW) is OK. // Benchy console tool measuring IPC for single-threaded code and 32-threaded code. // Useful comparison comes from the fact that codepath is the same for both single&multi compiles. // Requerements: // -AVX support (Vishera,Zambezi,i5-2500K or better); // 12GB free RAM (if it is virtual it makes no sense IPC-wise). // In fact this tool is specifically written to compare upcoming AMD 'Zen' versus all the rest. // How to compile? /* goto AVXless icl /O3 /QxSSE2 Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.c -D_N_YMM -D_N_prefetch_4096 -D_icl_mumbo_jumbo_ -D_N_REALTIME /FAcs if exist Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_1-thread.exe del Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_1-thread.exe ren Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.exe Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_1-thread.exe icl /O3 /QxSSE2 Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.c -D_N_YMM -D_N_prefetch_4096 -D_icl_mumbo_jumbo_ /Qopenmp /Qopenmp-link:static -DCommence_OpenMP -D_N_REALTIME /FAcs goto solong :AVXless icl /O3 /QxSSE2 Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.c -D_N_GP -D_N_prefetch_4096 -D_icl_mumbo_jumbo_ -D_N_REALTIME /FAcs if exist Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_1-thread.exe del Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_1-thread.exe ren Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.exe Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_1-thread.exe icl /O3 /QxSSE2 Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.c -D_N_GP -D_N_prefetch_4096 -D_icl_mumbo_jumbo_ /Qopenmp /Qopenmp-link:static -DCommence_OpenMP -D_N_REALTIME /FAcs :solong */ // gcc -O3 -mavx -fopenmp Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.c -D_N_YMM -D_N_prefetch_4096 -D_gcc_mumbo_jumbo_ -DCommence_OpenMP -S -o Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.S // gcc -O3 -mavx -fopenmp Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.c -D_N_YMM -D_N_prefetch_4096 -D_gcc_mumbo_jumbo_ -DCommence_OpenMP Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.exe // The snippet 47 (change 46 with 47 further below) instructions (GCC 5.1.0) from the 'Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC_32-threads.S', Decompress function main loop: /* .L353: movl (%rdx), %r9d movl %edi, %ebx movq %rdx, %rbp movl %r9d, %ecx notl %ecx andl $3, %ecx sall $3, %ecx shrl %cl, %ebx movl %ebx, %ecx andl %r9d, %ecx movl %ecx, %r9d movl %ecx, %r10d andl $15, %r9d cmpl $12, %r9d sete %r9b shrl $4, %r10d movl %r10d, %ebx movzbl %r9b, %r9d subq $1, %r9 movq %rbx, %r11 movq %r9, %r12 notq %r11 notq %r12 addq %rax, %r11 andq %r9, %r11 andq %r12, %rbp andq %r12, %rbx vmovdqu 1(%r11,%rbp), %ymm0 movl %ecx, %ebp andl $12, %ecx leal 4(%rcx), %r11d andl $3, %ebp addl $1, %ebp vmovdqu %ymm0, (%rax) movl %ebp, %ecx shrl $2, %ecx sall %cl, %r11d leal 1(%r10), %ecx andq %r9, %r11 andq %rbp, %r9 andq %r12, %rcx addq %rbx, %r11 addq %rcx, %r9 addq %r11, %rax addq %r9, %rdx cmpq %rdx, %r8 ja .L353 */ // Benchy console tool measuring IPC for single-threaded code, for now. /* D:\_KAZE\Instructions_per_tick_during_branchless_decompression>Get_IPC.bat D:\_KAZE\Instructions_per_tick_during_branchless_decompression>Nakamichi_Oniyanma_Monsterdragonfly_Lexx_IPC.exe Autobiography_411-ebooks_Collection.tar.Nakamichi Nakamichi 'Oniyanma-Monsterdragonfly-Lexx_IPC', written by Kaze, based on Nobuo Ito's LZSS source, babealicious suggestion by m^2 enforced, muffinesque suggestion by Jim Dempsey enforced. Decompressing 91964279 bytes ... RAM-to-RAM performance: 159 MB/s. Info1: One second seems to have 1,000 clocks. Info2: This CPU seems to be working at 2,829 MHz. Decompression time: 4,627,837,148 ticks. TPI (Ticks_Per_Instructions_during_branchless_decompression) performance: 2.067 IPC (Instructions_Per_Clock_during_branchless_decompression) performance: 0.484 ; mark_description Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140; ; mark_description -O3 -QxSSE2 -D_N_YMM -D_N_prefetch_4096 -D_icl_mumbo_jumbo_ -FAcs; .B14.3:: 00030 45 8b 38 mov r15d, DWORD PTR [r8] 00033 44 89 f9 mov ecx, r15d 00036 83 f1 03 xor ecx, 3 00039 41 bc ff ff ff ff mov r12d, -1 0003f c1 e1 03 shl ecx, 3 00042 bd 01 00 00 00 mov ebp, 1 00047 41 d3 ec shr r12d, cl 0004a 45 23 fc and r15d, r12d 0004d 45 33 e4 xor r12d, r12d 00050 45 89 fe mov r14d, r15d 00053 45 89 fb mov r11d, r15d 00056 41 83 e6 0f and r14d, 15 0005a 48 89 c1 mov rcx, rax 0005d 41 83 fe 0c cmp r14d, 12 00061 44 0f 44 e5 cmove r12d, ebp 00065 4c 89 c5 mov rbp, r8 00068 41 c1 eb 04 shr r11d, 4 0006c 41 ff cc dec r12d 0006f 45 89 da mov r10d, r11d 00072 4d 89 e6 mov r14, r12 00075 49 2b ca sub rcx, r10 00078 49 f7 d6 not r14 0007b 48 ff c9 dec rcx 0007e 49 23 ee and rbp, r14 00081 49 23 cc and rcx, r12 00084 41 ff c3 inc r11d 00087 4d 23 d6 and r10, r14 0008a 4d 23 de and r11, r14 0008d c5 fe 6f 44 29 01 vmovdqu ymm0, YMMWORD PTR [1+rcx+rbp] 00093 44 89 fd mov ebp, r15d 00096 83 e5 03 and ebp, 3 00099 41 83 e7 0c and r15d, 12 0009d ff c5 inc ebp 0009f 41 83 c7 04 add r15d, 4 000a3 89 e9 mov ecx, ebp 000a5 c1 e9 02 shr ecx, 2 000a8 41 d3 e7 shl r15d, cl 000ab 49 23 ec and rbp, r12 000ae 4d 23 fc and r15, r12 000b1 4c 03 dd add r11, rbp 000b4 4d 03 d7 add r10, r15 000b7 4d 03 c3 add r8, r11 000ba c5 fe 7f 00 vmovdqu YMMWORD PTR [rax], ymm0 000be 49 03 c2 add rax, r10 000c1 4d 3b c1 cmp r8, r9 000c4 0f 82 66 ff ff ff jb .B14.3 Note: Above 46 instructions are executed 29,763,921 times. D:\_KAZE\Instructions_per_tick_during_branchless_decompression> */ // Nakamichi is 100% FREE LZSS SUPERFAST decompressor. // Home of Nakamichi: www.sanmayce.com/Nakamichi/index.html // Also: http://www.sanmayce.com/Hayabusa/ // Also: http://www.codeproject.com/Articles/878593/Slowest-LZSS-Compressor-in-C // A benchy sub-variant showing IPCs. // Nakamichi_Oniyanma_Monsterdragonfly_Lexx.c, using 16B/4KB/1MB/256MB or (8-4)bit/(16-4)bit/(24-4)bit/(32-4)bit windows with 1/2/3/4 bytes long offsets. // 'Lexx' uses the unutilized 2bits from 'Hoshimi' thus achieving higher ratio, using YMM register slows down (for now) decompression but I don't care. // I am The Lexx. I am the most powerful weapon of destruction in the two universes. I was grown on the Cluster, which is ruled by His Shadow. // The food was good there. My captain is Stanley Tweedle. I blow up planets for him. // オニヤンマ oniyanma [noun] // Alternate Written Forms: // 馬大頭 oniyanma // 鬼蜻蜓 oniyanma // English Meaning(s) for オニヤンマ // 1. Anotogaster sieboldii (largest species of dragonfly in Japan) // Nakamichi_Loonette_Hoshimikou.c, using 16B/4KB/1MB/256MB or (8-4)bit/(16-4)bit/(24-4)bit/(32-4)bit windows with 1/2/3/4 bytes long offsets. // Tweaked: ML 4-8-12-16 are in use. // Nakamichi_Loonette_Hoshimi.c, using 4KB/1MB/256MB or (16-4)bit/(24-4)bit/(32-4)bit windows with 2/3/4 bytes long offsets. // Nakamichi_Hayabusa.c, using 32B/8KB/2MB or (8-3)bit/(16-3)bit/(24-3)bit windows with 1/2/3 bytes long offsets. Only 8 and 4 matchlengths. // Nakamichi_Tengu.c, using 16B/4KB/1MB or (8-4)bit/(16-4)bit/(24-4)bit windows with 1/2/3 bytes long offsets. // 中道 nakamichi [noun] // English Meaning(s) for 中道 // 1. road through the middle; middle road // Meanings for each kanji in 中道 // 中 in; inside; middle; mean; center // 道 road-way; street; district; journey; course; moral; teachings // 星 hoshi [noun] // English Meanings: // 1. star; any light-emitting (or reflecting) heavenly body (except for the sun and the moon) // 2. suspect (police slang) // 見 mi [noun, used as a suffix, noun] // English Meanings: // 1. looking; viewing // 公 kou [noun, suffix] // English Meanings: // 1. public matter; governmental matter // 2. prince; duke // 3. lord; sir // 4. familiar or derogatory suffix (after a name, etc.) // The formal name is 'Loonette Hoshimi Dame'. // The informal name is 'The Rabbitlet Girl Stargazing'. // Sir is an honorific address used in a number of situations in many anglophone cultures. // Equivalent terms of address to females are "ma'am" or "madam" in most cases, or in the case of a very young woman, girl, or unmarried woman who prefers to be addressed as such, "miss". // The equivalent term for a knighted woman or baronetess is Dame, or "Lady" for the wife of a knight or baronet. // Core 2 Q9550s: /* D:\_KAZE\Nakamichi_Loonette_Hoshimi_vs_enwik8>Nakamichi_Loonette-Hoshimi_branchless.exe enwik8.Nakamichi /bench Nakamichi 'Loonette-Hoshimi', written by Kaze, based on Nobuo Ito's LZSS source, babealicious suggestion by m^2 enforced, muffinesque suggestion by Jim Dempsey enforced. Decompressing 34968896 bytes ... RAM-to-RAM performance: 192 MB/s. Memory pool starting address: 0000000002620080 ... 64 byte aligned, OK Copying a 512MB block 1024 times i.e. 512GB READ + 512GB WRITTEN ... memcpy(): (512MB block); 524288MB copied in 193784 clocks or 2.706MB per clock RAM-to-RAM performance vs memcpy() ratio (bigger-the-better): 7% D:\_KAZE\Nakamichi_Loonette_Hoshimi_vs_enwik8>\sha1sum.exe enwik8 57b8363b814821dc9d47aa4d41f58733519076b2 enwik8 D:\_KAZE\Nakamichi_Loonette_Hoshimi_vs_enwik8>Nakamichi_Loonette_Hoshimi_XMM_PREFETCH_4096.exe enwik8.Nakamichi /bench Nakamichi 'Loonette-Hoshimi', written by Kaze, based on Nobuo Ito's LZSS source, babealicious suggestion by m^2 enforced, muffinesque suggestion by Jim Dempsey enforced. Decompressing 34968896 bytes ... RAM-to-RAM performance: 256 MB/s. Memory pool starting address: 0000000002710080 ... 64 byte aligned, OK Copying a 512MB block 1024 times i.e. 512GB READ + 512GB WRITTEN ... memcpy(): (512MB block); 524288MB copied in 193453 clocks or 2.710MB per clock RAM-to-RAM performance vs memcpy() ratio (bigger-the-better): 9% D:\_KAZE\Nakamichi_Loonette_Hoshimi_vs_enwik8>\sha1sum.exe enwik8 57b8363b814821dc9d47aa4d41f58733519076b2 enwik8 D:\_KAZE\Nakamichi_Loonette_Hoshimi_vs_enwik8> */ /* Shunkan twinkle v. intr. 1. To shine with slight, intermittent gleams, as distant lights or stars; flicker; glimmer. See Synonyms at flash. 2. To be bright or sparkling, as with merriment or delight: eyes that twinkled with joy. 3. To blink or wink the eyes. See Synonyms at blink. 4. To move about or to and fro rapidly and gracefully; flit. v. tr. To emit (light) in slight, intermittent gleams. n. 1. A slight, intermittent gleam of light; a sparkling flash; a glimmer. 2. A sparkle of merriment or delight in the eye. 3. A brief interval; a twinkling. 4. A rapid to-and-fro movement. [Middle English twinklen, from Old English twinclian, frequentative of twincan, to blink.] */ // How to compile: /* icl /O3 Nakamichi_Rakka.c -D_N_GP /FAcs ren Nakamichi_Rakka.cod Nakamichi_Rakka_GP.cod ren Nakamichi_Rakka.exe Nakamichi_Rakka_GP.exe icl /O3 /QxSSE2 Nakamichi_Rakka.c -D_N_XMM /FAcs ren Nakamichi_Rakka.cod Nakamichi_Rakka_XMM.cod ren Nakamichi_Rakka.exe Nakamichi_Rakka_XMM.exe icl /O3 /QxSSE2 Nakamichi_Rakka.c -D_N_YMM /FAcs ren Nakamichi_Rakka.cod Nakamichi_Rakka_YMM.cod ren Nakamichi_Rakka.exe Nakamichi_Rakka_YMM.exe */ // silk used in artwork : kenpon // Kinu - silk, Kinuno - silken // tiger : tora // Nakamichi_Rakka.c, using 16B/4KB/1MB/2MB or (8-4)bit/(16-4)bit/(24-4)bit/(32-4)bit windows with 1/2/3/4 bytes long offsets. // Nakamichi_Kokuen.c, using 16B/4KB/1MB/256MB or (8-4)bit/(16-4)bit/(24-4)bit/(32-4)bit windows with 1/2/3/4 bytes long offsets. // Nakamichi_Yoko.c, using 16B/4KB/1MB or (8-4)bit/(16-4)bit/(24-4)bit windows with 1/2/3 bytes long offsets. // Nakamichi_Kinutora.c, using 4KB/1MB or (16-4)bit/(24-4)bit windows with 2/3 bytes long offsets. // Nakamichi_Butsuhira.c, using 8KB/2MB or (16-3)bit/(24-3)bit windows with 2/3 bytes long offsets. // Nakamichi_Kinroba.c, using 4KB/1MB/256MB or 12bit/20bit/28bit windows with 2/3/4 bytes long offsets. // Nakamichi_Keigan.c, a branchless 'Kaibutsu' it is using 2MB/512MB or 21bit/29bit windows with 3/4 bytes long offsets. // Nakamichi_Washi.c, a branchless 'Kaibutsu' it is using 4MB window. /* D:\Nakamichi_Kinroba>Nakamichi_Kinroba_YMMless.exe dickens Nakamichi 'Kinroba', written by Kaze, based on Nobuo Ito's LZSS source, babealicious suggestion by m^2 enforced. Compressing 10192446 bytes ... \; Each rotation means 64KB are encoded; Done 100% NumberOfFullLiterals (lower-the-better): 81 NumberOfTinyMatchesSmallWindow (4): 226718 NumberOfShortMatchesSmallWindow (8): 89670 NumberOfMediumMatchesSmallWindow (16): 9084 NumberOfLongMatchesSmallWindow (32): 540 NumberOfTinyMatchesRegularWindow (4): 233284 NumberOfShortMatchesRegularWindow (8): 616461 NumberOfMediumMatchesRegularWindow (16): 45814 NumberOfLongMatchesRegularWindow (32): 3109 NumberOfTinyMatchesBigWindow (4): 0 NumberOfShortMatchesBigWindow (8): 114466 NumberOfMediumMatchesBigWindow (16): 39893 NumberOfLongMatchesBigWindow (32): 1861 RAM-to-RAM performance: 1 KB/s. D:\Nakamichi_Kinroba>dir 08/13/2014 06:49 AM 3,153,408 CalgaryCorpus.tar 08/13/2014 06:46 AM 1,346,675 CalgaryCorpus.tar.Nakamichi 08/13/2014 08:17 AM 10,192,446 dickens 08/13/2014 08:13 AM 4,111,829 dickens.Nakamichi 06/03/2014 07:35 PM 5,582,655 shaks12.txt 08/13/2014 08:50 AM 2,294,269 shaks12.txt.Nakamichi */ // Nakamichi_Kaiju.c, a branchless 'Kaibutsu' it is. // ML=9 // 68,352,060 enwik8.Kaiju.Nakamichi // ML=8 // 63,748,036 enwik8.Kaiju.Nakamichi // ML=7 // 59,771,603 enwik8.Kaiju.Nakamichi // ML=6 // 57,090,382 enwik8.Kaiju.Nakamichi // ML=5 // 56,188,976 // ML=4 // 58,954,436 enwik8.Kaiju.Nakamichi // Nakamichi_Kaibutsu.c, three small tweaks in Kaidanji, a good idea to remove shiftings altogether by m^2 was used. // Nakamichi_Kaidanji.c, is the very same '1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX'. // Nakamichi, revision 1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX, written by Kaze, babealicious suggestion by m^2 enforced. // Fixed! TO-DO: Known bug: the decompressed file sometimes has few additional bytes at the end. // Change #1: Now instead of looking first in the leftmost end of the window a "preemptive" search is done 16*8*128 bytes before the rightmost end of the window, there is the hottest (cachewise&matchwise) zone, as a side effect the compression speed is much higher. Maybe in the future I will try hashing as well. // Change #2: The full 16bits are used for offsets, 64KB window, that is. // Compile line: //icl /O3 Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.c -D_N_GP /FAcs //ren Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.cod Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX_GP.cod //ren Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.exe Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX_GP.exe //icl /O3 /QxSSE2 Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.c -D_N_XMM /FAcs //ren Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.cod Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX_XMM.cod //ren Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.exe Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX_XMM.exe //icl /O3 /QxAVX Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.c -D_N_YMM /FAcs //ren Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.cod Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX_YMM.cod //ren Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX.exe Nakamichi_r1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX_Kaidanji_FIX_YMM.exe // Nakamichi, revision 1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy_FIX, written by Kaze, babealicious suggestion by m^2 enforced. // Change #1: Nasty bug in Swampshine was fixed. // Change #2: Sanity check in compression section was added thus avoiding 'index-Min_Match_Length' going below 0. // Nakamichi, revision 1-RSSBO_1GB_Wordfetcher_TRIAD_NOmemcpy, written by Kaze, babealicious suggestion by m^2 enforced. // Change #1: 'memcpy' replaced by GP/XMM/YMM TRIADs. // Nakamichi, revision 1-RSSBO_1GB_Wordfetcher_TRIAD, written by Kaze. // Change #1: Decompression fetches WORD instead of BYTE+BYTE. // Change #2: Decompression stores three times 64bit instead of memcpy() for all transfers <=24 bytes. // Change #3: Fifteenth bit is used and then unused, 16KB -> 32KB -> 16KB. // 32KB window disappoints speedwise, also sizewise: /* D:\_KAZE\_KAZE_GOLD\Nakamichi_projectOLD\Nakamichi_vs_Yappy>Nakamichi_r1-RSSBO_1GB_15bit_Wordfetcher.exe enwik8 Nakamichi, revision 1-RSSBO_1GB_15bit_Wordfetcher, written by Kaze, based on Nobuo Ito's LZSS source. Compressing 100000000 bytes ... -; Each rotation means 128KB are encoded; Done 100% RAM-to-RAM performance: 130 KB/s. D:\_KAZE\_KAZE_GOLD\Nakamichi_projectOLD\Nakamichi_vs_Yappy>Nakamichi_r1-RSSBO_1GB_15bit_Wordfetcher.exe enwik8.Nakamichi Nakamichi, revision 1-RSSBO_1GB_15bit_Wordfetcher, written by Kaze, based on Nobuo Ito's LZSS source. Decompressing 65693566 bytes ... RAM-to-RAM performance: 358 MB/s. D:\_KAZE\_KAZE_GOLD\Nakamichi_projectOLD\Nakamichi_vs_Yappy>Nakamichi_r1-RSSBO_1GB_15bit_Wordfetcher.exe enwik9 Nakamichi, revision 1-RSSBO_1GB_15bit_Wordfetcher, written by Kaze, based on Nobuo Ito's LZSS source. Compressing 1000000000 bytes ... /; Each rotation means 128KB are encoded; Done 100% RAM-to-RAM performance: 150 KB/s. D:\_KAZE\_KAZE_GOLD\Nakamichi_projectOLD\Nakamichi_vs_Yappy>Nakamichi_r1-RSSBO_1GB_15bit_Wordfetcher.exe enwik9.Nakamichi Nakamichi, revision 1-RSSBO_1GB_15bit_Wordfetcher, written by Kaze, based on Nobuo Ito's LZSS source. Decompressing 609319736 bytes ... RAM-to-RAM performance: 379 MB/s. */ // 1-RSSBO_1GB vs 1-RSSBO_1GB_15bit_Wordfetcher (16KB/32KB respectively): // 069,443,065 vs 065,693,566 // 641,441,055 vs 609,319,736 // Nakamichi, revision 1-RSSBO_1GB, written by Kaze. // Based on Nobuo Ito's source, thanks Ito. // The main goal of Nakamichi is to allow supersimple and superfast decoding for English x-grams (mainly) in pure C, or not, heh-heh. // Natively Nakamichi is targeted as 64bit tool with 16 threads, helping Kazahana to traverse faster when I/O is not superior. // In short, Nakamichi is intended as x-gram decompressor. // Eightfold Path ~ the Buddhist path to nirvana, comprising eight aspects in which an aspirant must become practised; // eightfold way ~ (Physics), the grouping of hadrons into supermultiplets by means of SU(3)); (b) adverb to eight times the number or quantity: OE. // Note1: Fifteenth bit is not used, making the window wider by 1bit i.e. 32KB is not tempting, rather I think to use it as a flag: 8bytes/16bytes. // Note2: English x-grams are as English texts but more redundant, in other words they are phraselists in most cases, sometimes wordlists. // Note3: On OSHO.TXT, being a typical English text, Nakamichi's compression ratio is among the worsts: // 206,908,949 OSHO.TXT // 125,022,859 OSHO.TXT.Nakamichi // It struggles with English texts but decomprression speed is quite sweet (Core 2 T7500 2200MHz, 32bit code): // Nakamichi, revision 1-, written by Kaze. // Decompressing 125022859 bytes ... // RAM-to-RAM performance: 477681 KB/s. // Note4: Also I wanted to see how my 'Railgun_Swampshine_BailOut', being a HEAVYGUN i.e. with big overhead and latency, hits in a real-world application. // Quick notes on PAGODAs (the padded x-gram lists): // Every single word in English has its own PAGODA, in example below 'on' PAGODA is given (Kazahana_on.PAGODA-order-5.txt): // PAGODA order 5 (i.e. with 5 tiers) has 5*(5+1)/2=15 subtiers, they are concatenated and space-padded in order to form the pillar 'on': /* D:\_KAZE\Nakamichi_r1-RSSBO>dir \_GW\ka* 04/12/2014 05:07 AM 14 Kazahana_on.1-1.txt 04/12/2014 05:07 AM 1,635,389 Kazahana_on.2-1.txt 04/12/2014 05:07 AM 1,906,734 Kazahana_on.2-2.txt 04/12/2014 05:07 AM 10,891,415 Kazahana_on.3-1.txt 04/12/2014 05:07 AM 15,797,703 Kazahana_on.3-2.txt 04/12/2014 05:07 AM 20,419,280 Kazahana_on.3-3.txt 04/12/2014 05:07 AM 22,141,823 Kazahana_on.4-1.txt 04/12/2014 05:07 AM 36,002,113 Kazahana_on.4-2.txt 04/12/2014 05:07 AM 33,236,772 Kazahana_on.4-3.txt 04/12/2014 05:07 AM 33,902,425 Kazahana_on.4-4.txt 04/12/2014 05:07 AM 24,795,989 Kazahana_on.5-1.txt 04/12/2014 05:07 AM 30,766,220 Kazahana_on.5-2.txt 04/12/2014 05:07 AM 38,982,816 Kazahana_on.5-3.txt 04/12/2014 05:07 AM 38,089,575 Kazahana_on.5-4.txt 04/12/2014 05:07 AM 34,309,057 Kazahana_on.5-5.txt 04/12/2014 05:07 AM 846,351,894 Kazahana_on.PAGODA-order-5.txt D:\_KAZE\Nakamichi_r1-RSSBO>type \_GW\Kazahana_on.1-1.txt 9,999,999 on D:\_KAZE\Nakamichi_r1-RSSBO>type \_GW\Kazahana_on.2-1.txt 9,999,999 on_the 1,148,054 on_his 0,559,694 on_her 0,487,856 on_this 0,399,485 on_your 0,381,570 on_my 0,367,282 on_their ... D:\_KAZE\Nakamichi_r1-RSSBO>type \_GW\Kazahana_on.2-2.txt 0,545,191 based_on 0,397,408 and_on 0,334,266 go_on 0,329,561 went_on 0,263,035 was_on 0,246,332 it_on 0,229,041 down_on 0,202,151 going_on ... D:\_KAZE\Nakamichi_r1-RSSBO>type \_GW\Kazahana_on.5-5.txt 0,083,564 foundation_osho_s_books_on 0,012,404 medium_it_may_be_on 0,012,354 if_you_received_it_on 0,012,152 medium_they_may_be_on 0,012,144 agree_to_also_provide_on 0,012,139 a_united_states_copyright_on 0,008,067 we_are_constantly_working_on 0,008,067 questions_we_have_received_on 0,006,847 file_was_first_posted_on 0,006,441 of_we_are_already_on 0,006,279 you_received_this_ebook_on 0,005,865 you_received_this_etext_on 0,005,833 to_keep_an_eye_on ... D:\_KAZE\Nakamichi_r1-RSSBO>dir 04/12/2014 05:07 AM 846,351,894 Kazahana_on.PAGODA-order-5.txt D:\_KAZE\Nakamichi_r1-RSSBO>Nakamichi.exe Kazahana_on.PAGODA-order-5.txt Nakamichi, revision 1-RSSBO, written by Kaze. Compressing 846351894 bytes ... /; Each rotation means 128KB are encoded; Done 100% RAM-to-RAM performance: 512 KB/s. D:\_KAZE\Nakamichi_r1-RSSBO>dir 04/12/2014 05:07 AM 846,351,894 Kazahana_on.PAGODA-order-5.txt 04/15/2014 06:30 PM 293,049,398 Kazahana_on.PAGODA-order-5.txt.Nakamichi D:\_KAZE\Nakamichi_r1-RSSBO>Nakamichi.exe Kazahana_on.PAGODA-order-5.txt.Nakamichi Nakamichi, revision 1-RSSBO, written by Kaze. Decompressing 293049398 bytes ... RAM-to-RAM performance: 607 MB/s. D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 4096 YAPPY: [b 4K] bytes 846351894 -> 191149889 22.6% comp 33.8 MB/s uncomp 875.4 MB/s D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 8192 YAPPY: [b 8K] bytes 846351894 -> 184153244 21.8% comp 35.0 MB/s uncomp 898.3 MB/s D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 16384 YAPPY: [b 16K] bytes 846351894 -> 180650931 21.3% comp 28.8 MB/s uncomp 906.4 MB/s D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 32768 YAPPY: [b 32K] bytes 846351894 -> 178902966 21.1% comp 35.0 MB/s uncomp 906.4 MB/s D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 65536 YAPPY: [b 64K] bytes 846351894 -> 178027899 21.0% comp 34.5 MB/s uncomp 914.6 MB/s D:\_KAZE\Nakamichi_r1-RSSBO>Yappy.exe Kazahana_on.PAGODA-order-5.txt 131072 YAPPY: [b 128K] bytes 846351894 -> 177591807 21.0% comp 34.9 MB/s uncomp 906.4 MB/s D:\_KAZE\Nakamichi_r1-RSSBO> */ #ifdef Commence_OpenMP #include #endif #ifndef NULL #ifdef __cplusplus #define NULL 0 #else #define NULL ((void*)0) #endif #endif // During compilation use one of these, the granularity of the padded 'memcpy', 4x2x8/2x2x16/1x2x32/1x1x64 respectively as GP/XMM/YMM/ZMM, the maximum literal length reduced from 127 to 63: //#define _N_GP //#define _N_XMM //#define _N_YMM //#define _N_ZMM //#define _N_prefetch_64 //#define _N_prefetch_128 //#define _N_prefetch_4096 #include #include #include // uint64_t needed #include #include #ifdef _N_XMM #include // SSE2 intrinsics #include // SSE4.1 intrinsics #endif #ifdef _N_YMM #include // SSE2 intrinsics #include // SSE4.1 intrinsics #include // AVX intrinsics #endif #ifdef _N_ZMM #include // SSE2 intrinsics #include // SSE4.1 intrinsics #include // AVX intrinsics #include // AVX2 intrinsics, definitions and declarations for use with 512-bit compiler intrinsics. #endif #ifdef _N_XMM void SlowCopy128bit (const char *SOURCE, char *TARGET) { _mm_storeu_si128((__m128i *)(TARGET), _mm_loadu_si128((const __m128i *)(SOURCE))); } #endif #ifdef _N_YMM void SlowCopy128bit (const char *SOURCE, char *TARGET) { _mm_storeu_si128((__m128i *)(TARGET), _mm_loadu_si128((const __m128i *)(SOURCE))); } #endif #ifdef _N_ZMM void SlowCopy128bit (const char *SOURCE, char *TARGET) { _mm_storeu_si128((__m128i *)(TARGET), _mm_loadu_si128((const __m128i *)(SOURCE))); } #endif /* * Move Unaligned Packed Integer Values * **** VMOVDQU ymm1, m256 * **** VMOVDQU m256, ymm1 * Moves 256 bits of packed integer values from the source operand to the * destination */ //extern __m256i __ICL_INTRINCC _mm256_loadu_si256(__m256i const *); //extern void __ICL_INTRINCC _mm256_storeu_si256(__m256i *, __m256i); #ifdef _N_YMM void SlowCopy256bit (const char *SOURCE, char *TARGET) { _mm256_storeu_si256((__m256i *)(TARGET), _mm256_loadu_si256((const __m256i *)(SOURCE))); } #endif //extern __m512i __ICL_INTRINCC _mm512_loadu_si512(void const*); //extern void __ICL_INTRINCC _mm512_storeu_si512(void*, __m512i); #ifdef _N_ZMM void SlowCopy512bit (const char *SOURCE, char *TARGET) { _mm512_storeu_si512((__m512i *)(TARGET), _mm512_loadu_si512((const __m512i *)(SOURCE))); } #endif #ifndef NULL #define NULL ((void*)0) #endif // Comment it to see how slower 'BruteForce' is, for Wikipedia 100MB the ratio is 41KB/s versus 197KB/s. #define ReplaceBruteForceWithRailgunSwampshineBailOut // Change accordingly from command line: //#define _icl_mumbo_jumbo_ //#define _gcc_mumbo_jumbo_ // Change appropriately: #define _WIN32_ENVIRONMENT_ //#define _POSIX_ENVIRONMENT_ #if defined(_icl_mumbo_jumbo_) // GetRDTSC() taken from strchr.com #if defined(_M_IX86) unsigned long long __forceinline GetRDTSC(void) { __asm { ; Flush the pipeline XOR eax, eax CPUID ; Get RDTSC counter in edx:eax RDTSC } } #elif defined(_M_X64) unsigned long long __forceinline GetRDTSC(void) { return __rdtsc(); } #else unsigned long long __forceinline GetRDTSC(void) { return GetTickCount(); } #endif #endif #if defined(_gcc_mumbo_jumbo_) static __inline__ unsigned long long GetRDTSC() { unsigned hi, lo; __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); } #endif // jacob navia /* #include long long (*rdtsc)(void); int main(void) { unsigned char fn[] = {0xf,0x31,0xc3}; long long cycles; rdtsc = (long long(*)(void)) fn; cycles = rdtsc(); printf("Cycles since machine start = %lld\n",cycles); return 0; } */ #ifdef _N_REALTIME // https://msdn.microsoft.com/en-us/library/windows/desktop/ms686219.aspx #include #include #include #endif /* int main( void ) { DWORD dwError, dwPriClass; if(!SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS)) { _tprintf(TEXT("Already REALTIME_PRIORITY\n")); goto Cleanup; } // Display priority class dwPriClass = GetPriorityClass(GetCurrentProcess()); _tprintf(TEXT("Current priority class is 0x%x\n"), dwPriClass); if (dwPriClass==0x00000100) printf("Current priority class is REALTIME_PRIORITY_CLASS.\n"); Cleanup: // Clean up ; return 0; } // IDLE_PRIORITY_CLASS // 0x00000040 // Process whose threads run only when the system is idle. The threads of the process are preempted by the threads of any process running in a higher priority class. An example is a screen saver. The idle-priority class is inherited by child processes. // NORMAL_PRIORITY_CLASS // 0x00000020 // Process with no special scheduling needs. // HIGH_PRIORITY_CLASS // 0x00000080 // Process that performs time-critical tasks that must be executed immediately. The threads of the process preempt the threads of normal or idle priority class processes. An example is the Task List, which must respond quickly when called by the user, regardless of the load on the operating system. Use extreme care when using the high-priority class, because a high-priority class application can use nearly all available CPU time. // REALTIME_PRIORITY_CLASS // 0x00000100 // Process that has the highest possible priority. The threads of the process preempt the threads of all other processes, including operating system processes performing important tasks. For example, a real-time process that executes for more than a very brief interval can cause disk caches not to flush or cause the mouse to be unresponsive. */ void x64toaKAZE ( /* stdcall is faster and smaller... Might as well use it for the helper. */ unsigned long long val, char *buf, unsigned radix, int is_neg ) { char *p; /* pointer to traverse string */ char *firstdig; /* pointer to first digit */ char temp; /* temp char */ unsigned digval; /* value of digit */ p = buf; if ( is_neg ) { *p++ = '-'; /* negative, so output '-' and negate */ val = (unsigned long long)(-(long long)val); } firstdig = p; /* save pointer to first digit */ do { digval = (unsigned) (val % radix); val /= radix; /* get next digit */ /* convert to ascii and store */ if (digval > 9) *p++ = (char) (digval - 10 + 'a'); /* a letter */ else *p++ = (char) (digval + '0'); /* a digit */ } while (val > 0); /* We now have the digit of the number in the buffer, but in reverse order. Thus we reverse them now. */ *p-- = '\0'; /* terminate string; p points to last digit */ do { temp = *p; *p = *firstdig; *firstdig = temp; /* swap *p and *firstdig */ --p; ++firstdig; /* advance to next two digits */ } while (firstdig < p); /* repeat until halfway */ } /* Actual functions just call conversion helper with neg flag set correctly, and return pointer to buffer. */ char * _i64toaKAZE ( long long val, char *buf, int radix ) { x64toaKAZE((unsigned long long)val, buf, radix, (radix == 10 && val < 0)); return buf; } char * _ui64toaKAZE ( unsigned long long val, char *buf, int radix ) { x64toaKAZE(val, buf, radix, 0); return buf; } char * _ui64toaKAZEzerocomma ( unsigned long long val, char *buf, int radix ) { char *p; char temp; int txpman; int pxnman; x64toaKAZE(val, buf, radix, 0); p = buf; do { } while (*++p != '\0'); p--; // p points to last digit // buf points to first digit buf[26] = 0; txpman = 1; pxnman = 0; do { if (buf <= p) { temp = *p; buf[26-txpman] = temp; pxnman++; p--; if (pxnman % 3 == 0) { txpman++; buf[26-txpman] = (char) (','); } } else { buf[26-txpman] = (char) ('0'); pxnman++; if (pxnman % 3 == 0) { txpman++; buf[26-txpman] = (char) (','); } } txpman++; } while (txpman <= 26); return buf; } char * _ui64toaKAZEcomma ( unsigned long long val, char *buf, int radix ) { char *p; char temp; int txpman; int pxnman; x64toaKAZE(val, buf, radix, 0); p = buf; do { } while (*++p != '\0'); p--; // p points to last digit // buf points to first digit buf[26] = 0; txpman = 1; pxnman = 0; while (buf <= p) { temp = *p; buf[26-txpman] = temp; pxnman++; p--; if (pxnman % 3 == 0 && buf <= p) { txpman++; buf[26-txpman] = (char) (','); } txpman++; } return buf+26-(txpman-1); } char * _ui64toaKAZEzerocomma4 ( unsigned long long val, char *buf, int radix ) { char *p; char temp; int txpman; int pxnman; x64toaKAZE(val, buf, radix, 0); p = buf; do { } while (*++p != '\0'); p--; // p points to last digit // buf points to first digit buf[26] = 0; txpman = 1; pxnman = 0; do { if (buf <= p) { temp = *p; buf[26-txpman] = temp; pxnman++; p--; if (pxnman % 4 == 0) { txpman++; buf[26-txpman] = (char) (','); } } else { buf[26-txpman] = (char) ('0'); pxnman++; if (pxnman % 4 == 0) { txpman++; buf[26-txpman] = (char) (','); } } txpman++; } while (txpman <= 26); return buf; } /* minimum signed 64 bit value */ //#define _I64_MIN (-9223372036854775807i64 - 1) /* maximum signed 64 bit value */ //#define _I64_MAX 9223372036854775807i64 /* maximum unsigned 64 bit value */ //#define _UI64_MAX 0xffffffffffffffffui64 /* minimum signed 128 bit value */ //#define _I128_MIN (-170141183460469231731687303715884105727i128 - 1) /* maximum signed 128 bit value */ //#define _I128_MAX 170141183460469231731687303715884105727i128 /* maximum unsigned 128 bit value */ //#define _UI128_MAX 0xffffffffffffffffffffffffffffffffui128 char llTOaDigits[27]; // 9,223,372,036,854,775,807: 1(sign or carry)+19(digits)+1('\0')+6(,) // below duplicates are needed because of one_line_invoking need different buffers. char llTOaDigits2[27]; // 9,223,372,036,854,775,807: 1(sign or carry)+19(digits)+1('\0')+6(,) char llTOaDigits3[27]; // 9,223,372,036,854,775,807: 1(sign or carry)+19(digits)+1('\0')+6(,) char llTOaDigits4[27]; // 9,223,372,036,854,775,807: 1(sign or carry)+19(digits)+1('\0')+6(,) unsigned char fn[] = {0xf,0x31,0xc3}; void SearchIntoSlidingWindow(unsigned int* ShortMediumLongOFFSET, unsigned int* retIndex, unsigned int* retMatch, char* refStart,char* refEnd,char* encStart,char* encEnd); unsigned int SlidingWindowVsLookAheadBuffer(char* refStart, char* refEnd, char* encStart, char* encEnd); unsigned int Compress(char* ret, char* src, unsigned int srcSize); uint64_t Decompress(unsigned char* ret, unsigned char* src, uint64_t srcSize); char * Railgun_Swampshine_BailOut(char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern); char * Railgun_Doublet (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern); uint64_t Decompress001 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress002 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress003 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress004 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress005 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress006 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress007 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress008 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress009 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress010 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress011 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress012 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress013 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress014 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress015 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress016 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress017 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress018 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress019 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress020 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress021 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress022 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress023 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress024 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress025 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress026 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress027 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress028 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress029 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress030 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress031 (unsigned char* ret, unsigned char* src, uint64_t srcSize); uint64_t Decompress032 (unsigned char* ret, unsigned char* src, uint64_t srcSize); /* void memcpy_AVX_4K_prefetched (void *dst, const void *src, size_t nbytes) { // F3 0F 6F /r RM V/V SSE2 Move unaligned packed integer values from xmm2/m128 to xmm1. // MOVDQU xmm1, xmm2/m128 // F3 0F 7F /r MR V/V SSE2 Move unaligned packed integer values from xmm1 to xmm2/m128. // MOVDQU xmm2/m128, xmm1 // VEX.128.F3.0F.WIG 6F /r RM V/V AVX Move unaligned packed integer values from xmm2/m128 to xmm1. // VMOVDQU xmm1, xmm2/m128 // VEX.128.F3.0F.WIG 7F /r MR V/V AVX Move unaligned packed integer values from xmm1 to xmm2/m128. // VMOVDQU xmm2/m128, xmm1 // VEX.256.F3.0F.WIG 6F /r RM V/V AVX Move unaligned packed integer values from ymm2/m256 to ymm1. // VMOVDQU ymm1, ymm2/m256 // VEX.256.F3.0F.WIG 7F /r MR V/V AVX Move unaligned packed integer values from ymm1 to ymm2/m256. // VMOVDQU ymm2/m256, ymm1 if ( (nbytes&0x3f) == 0 ) { // 64bytes per cycle __asm{ mov rsi, src mov rdi, dst mov rcx, nbytes shr rcx, 6 main_loop: test rcx, rcx ; 'nbytes' may be 0 jz main_loop_end prefetcht0 [rsi+64*64] vmovdqu xmm0, [rsi] vmovdqu xmm1, [rsi+16] vmovdqu xmm2, [rsi+32] vmovdqu xmm3, [rsi+48] vmovdqu [rdi], xmm0 vmovdqu [rdi+16], xmm1 vmovdqu [rdi+32], xmm2 vmovdqu [rdi+48], xmm3 add rsi, 64 add rdi, 64 dec rcx jmp main_loop main_loop_end: sfence } } else memcpy(dst, src, nbytes); } void memcpy_SSE2_4K_prefetched (void *dst, const void *src, size_t nbytes) { // F3 0F 6F /r RM V/V SSE2 Move unaligned packed integer values from xmm2/m128 to xmm1. // MOVDQU xmm1, xmm2/m128 // F3 0F 7F /r MR V/V SSE2 Move unaligned packed integer values from xmm1 to xmm2/m128. // MOVDQU xmm2/m128, xmm1 // VEX.128.F3.0F.WIG 6F /r RM V/V AVX Move unaligned packed integer values from xmm2/m128 to xmm1. // VMOVDQU xmm1, xmm2/m128 // VEX.128.F3.0F.WIG 7F /r MR V/V AVX Move unaligned packed integer values from xmm1 to xmm2/m128. // VMOVDQU xmm2/m128, xmm1 // VEX.256.F3.0F.WIG 6F /r RM V/V AVX Move unaligned packed integer values from ymm2/m256 to ymm1. // VMOVDQU ymm1, ymm2/m256 // VEX.256.F3.0F.WIG 7F /r MR V/V AVX Move unaligned packed integer values from ymm1 to ymm2/m256. // VMOVDQU ymm2/m256, ymm1 if ( (nbytes&0x3f) == 0 ) { // 64bytes per cycle __asm{ mov rsi, src mov rdi, dst mov rcx, nbytes shr rcx, 6 main_loop: test rcx, rcx ; 'nbytes' may be 0 jz main_loop_end prefetcht0 [rsi+64*64] movdqu xmm0, [rsi] movdqu xmm1, [rsi+16] movdqu xmm2, [rsi+32] movdqu xmm3, [rsi+48] movdqu [rdi], xmm0 movdqu [rdi+16], xmm1 movdqu [rdi+32], xmm2 movdqu [rdi+48], xmm3 add rsi, 64 add rdi, 64 dec rcx jmp main_loop main_loop_end: sfence } } else memcpy(dst, src, nbytes); } */ // Min_Match_Length=THRESHOLD=4 means 4 and bigger are to be encoded: #define Min_Match_BAILOUT_Length (8) #define Min_Match_Length (32) #define Min_Match_Length_SHORT (5) #define OffsetBITS (32-3) #define LengthBITS (1) //12bit //#define REF_SIZE (4095+Min_Match_Length) //#define REF_SIZE ( ((1<>20; k=k*Trials; printf("RAM-to-RAM performance: %d MB/s.\n", k); // Warm up done... (void) time(&t1); (void) time(&t3); while (t3 == t1) (void) time(&t3); t1=t3; clocks1 = clock(); #if defined(_icl_mumbo_jumbo_) ticksStart = GetRDTSC(); #endif while (t3 != t1+2) (void) time(&t3); #if defined(_icl_mumbo_jumbo_) ticksTOTAL = ticksTOTAL + GetRDTSC() - ticksStart; #endif printf("Info1: One second seems to have %s clocks.\n", _ui64toaKAZEcomma((clock()-clocks1)/2, llTOaDigits, 10)); #if defined(_icl_mumbo_jumbo_) printf("Info2: This CPU seems to be working at %s MHz.\n", _ui64toaKAZEcomma(ticksTOTAL/2/1000000, llTOaDigits, 10)); #endif // Go... #if defined(_icl_mumbo_jumbo_) ticksStart = GetRDTSC(); #endif TargetSize = Decompress(TargetBlock, SourceBlock, SourceSize); #if defined(_icl_mumbo_jumbo_) ticksTOTAL2 = ticksTOTAL2 + GetRDTSC() - ticksStart; #endif printf("Decompression time: %s ticks.\n", _ui64toaKAZEcomma(ticksTOTAL2, llTOaDigits, 10)); // printf("TPI (Ticks_Per_Instruction_during_branchless_decompression) performance: %.3f\n", (float)(ticksTOTAL/2) / (float)(46*loopcounterfor411) ); // printf("IPC (Instructions_Per_Clock_during_branchless_decompression) performance: %.3f\n\n", (float)(46*loopcounterfor411) / (float)(ticksTOTAL/2) ); // Above two lines are buggy, the fix is below: printf("TPI (Ticks_Per_Instruction_during_branchless_decompression) performance: %.3f\n", (float)(ticksTOTAL2) / (float)(46*loopcounterfor411) ); printf("IPC (Instructions_Per_Clock_during_branchless_decompression) performance: %.3f\n\n", (float)(46*loopcounterfor411) / (float)(ticksTOTAL2) ); printf("; mark_description Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140;\n"); printf("; mark_description -O3 -QxSSE2 -D_N_YMM -D_N_prefetch_4096 -D_icl_mumbo_jumbo_ -FAcs;\n"); printf("\n"); printf(".B14.3:: \n"); printf(" 00030 45 8b 38 mov r15d, DWORD PTR [r8] \n"); printf(" 00033 44 89 f9 mov ecx, r15d \n"); printf(" 00036 83 f1 03 xor ecx, 3 \n"); printf(" 00039 41 bc ff ff ff \n"); printf(" ff mov r12d, -1 \n"); printf(" 0003f c1 e1 03 shl ecx, 3 \n"); printf(" 00042 bd 01 00 00 00 mov ebp, 1 \n"); printf(" 00047 41 d3 ec shr r12d, cl \n"); printf(" 0004a 45 23 fc and r15d, r12d \n"); printf(" 0004d 45 33 e4 xor r12d, r12d \n"); printf(" 00050 45 89 fe mov r14d, r15d \n"); printf(" 00053 45 89 fb mov r11d, r15d \n"); printf(" 00056 41 83 e6 0f and r14d, 15 \n"); printf(" 0005a 48 89 c1 mov rcx, rax \n"); printf(" 0005d 41 83 fe 0c cmp r14d, 12 \n"); printf(" 00061 44 0f 44 e5 cmove r12d, ebp \n"); printf(" 00065 4c 89 c5 mov rbp, r8 \n"); printf(" 00068 41 c1 eb 04 shr r11d, 4 \n"); printf(" 0006c 41 ff cc dec r12d \n"); printf(" 0006f 45 89 da mov r10d, r11d \n"); printf(" 00072 4d 89 e6 mov r14, r12 \n"); printf(" 00075 49 2b ca sub rcx, r10 \n"); printf(" 00078 49 f7 d6 not r14 \n"); printf(" 0007b 48 ff c9 dec rcx \n"); printf(" 0007e 49 23 ee and rbp, r14 \n"); printf(" 00081 49 23 cc and rcx, r12 \n"); printf(" 00084 41 ff c3 inc r11d \n"); printf(" 00087 4d 23 d6 and r10, r14 \n"); printf(" 0008a 4d 23 de and r11, r14 \n"); printf(" 0008d c5 fe 6f 44 29 \n"); printf(" 01 vmovdqu ymm0, YMMWORD PTR [1+rcx+rbp] \n"); printf(" 00093 44 89 fd mov ebp, r15d \n"); printf(" 00096 83 e5 03 and ebp, 3 \n"); printf(" 00099 41 83 e7 0c and r15d, 12 \n"); printf(" 0009d ff c5 inc ebp \n"); printf(" 0009f 41 83 c7 04 add r15d, 4 \n"); printf(" 000a3 89 e9 mov ecx, ebp \n"); printf(" 000a5 c1 e9 02 shr ecx, 2 \n"); printf(" 000a8 41 d3 e7 shl r15d, cl \n"); printf(" 000ab 49 23 ec and rbp, r12 \n"); printf(" 000ae 4d 23 fc and r15, r12 \n"); printf(" 000b1 4c 03 dd add r11, rbp \n"); printf(" 000b4 4d 03 d7 add r10, r15 \n"); printf(" 000b7 4d 03 c3 add r8, r11 \n"); printf(" 000ba c5 fe 7f 00 vmovdqu YMMWORD PTR [rax], ymm0 \n"); printf(" 000be 49 03 c2 add rax, r10 \n"); printf(" 000c1 4d 3b c1 cmp r8, r9 \n"); printf(" 000c4 0f 82 66 ff ff \n"); printf(" ff jb .B14.3 \n"); printf("\n"); printf("Note: Above 46 instructions are executed 29,763,921 times.\n"); SkipTheOldStuff: NumberOfThreadsToPlayWith = 32; SourceFileSize=91964279; printf("Allocating %s bytes...\n", _ui64toaKAZEcomma((uint64_t)SourceFileSize*NumberOfThreadsToPlayWith+512, llTOaDigits, 10)); SourceBlock = (unsigned char*)malloc((uint64_t)SourceFileSize*NumberOfThreadsToPlayWith+512); if( SourceBlock == NULL ) { puts( "\nLexx: Needed memory allocation denied!\n" ); return( 1 ); } TargetFileSize=273401856; TargetSize=273401856; printf("Allocating %s bytes...\n", _ui64toaKAZEcomma((uint64_t)TargetFileSize*NumberOfThreadsToPlayWith+512*NumberOfThreadsToPlayWith, llTOaDigits, 10)); TargetBlock = (unsigned char*)malloc((uint64_t)TargetFileSize*NumberOfThreadsToPlayWith+512*NumberOfThreadsToPlayWith); if( TargetBlock == NULL ) { free(SourceBlock); puts( "\nLexx: Needed memory allocation denied!\n" ); return( 1 ); } printf("Source&Target buffers are allocated.\n"); fread(SourceBlock, 1, SourceFileSize, fp); fclose(fp); printf("Simulating we have %d blocks for decompression...\n", NumberOfThreadsToPlayWith); for (i = 1; i <= (NumberOfThreadsToPlayWith-1); i++) { memcpy(SourceBlock+(uint64_t)i*SourceFileSize, SourceBlock, SourceFileSize); } #ifdef Commence_OpenMP printf("Enforcing %d thread(s).\n", NumberOfThreadsToPlayWith); #else printf("Enforcing 1 thread.\n"); #endif #ifdef Commence_OpenMP printf("omp_get_num_procs( ) = %d\n", omp_get_num_procs( )); printf("omp_get_max_threads( ) = %d\n", omp_get_max_threads( )); #endif #if defined(_icl_mumbo_jumbo_) ticksStart = GetRDTSC(); #endif #if defined(_gcc_mumbo_jumbo_) ticksStart = GetRDTSC(); #endif Trials=64; for (i = 1; i <= Trials; i++) { #ifdef Commence_OpenMP #pragma omp parallel shared(TargetBlock, SourceBlock, TargetFileSize, SourceFileSize) private(TargetSize001,TargetSize002,TargetSize003,TargetSize004,TargetSize005,TargetSize006,TargetSize007,TargetSize008,TargetSize009,TargetSize010,TargetSize011,TargetSize012,TargetSize013,TargetSize014,TargetSize015,TargetSize016,TargetSize017,TargetSize018,TargetSize019,TargetSize020,TargetSize021,TargetSize022,TargetSize023,TargetSize024,TargetSize025,TargetSize026,TargetSize027,TargetSize028,TargetSize029,TargetSize030,TargetSize031,TargetSize032) #endif { #ifdef Commence_OpenMP #pragma omp sections #endif { #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 001: TargetSize001 = Decompress001(TargetBlock+(uint64_t)(1-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(1-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize001) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 002: TargetSize002 = Decompress002(TargetBlock+(uint64_t)(2-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(2-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize002) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 003: TargetSize003 = Decompress003(TargetBlock+(uint64_t)(3-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(3-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize003) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 004: TargetSize004 = Decompress004(TargetBlock+(uint64_t)(4-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(4-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize004) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 005: TargetSize005 = Decompress005(TargetBlock+(uint64_t)(5-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(5-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize005) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 006: TargetSize006 = Decompress006(TargetBlock+(uint64_t)(6-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(6-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize006) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 007: TargetSize007 = Decompress007(TargetBlock+(uint64_t)(7-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(7-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize007) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 008: TargetSize008 = Decompress008(TargetBlock+(uint64_t)(8-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(8-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize008) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 009: TargetSize009 = Decompress009(TargetBlock+(uint64_t)(9-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(9-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize009) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 010: TargetSize010 = Decompress010(TargetBlock+(uint64_t)(10-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(10-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize010) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 011: TargetSize011 = Decompress011(TargetBlock+(uint64_t)(11-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(11-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize011) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 012: TargetSize012 = Decompress012(TargetBlock+(uint64_t)(12-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(12-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize012) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 013: TargetSize013 = Decompress013(TargetBlock+(uint64_t)(13-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(13-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize013) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 014: TargetSize014 = Decompress014(TargetBlock+(uint64_t)(14-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(14-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize014) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 015: TargetSize015 = Decompress015(TargetBlock+(uint64_t)(15-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(15-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize015) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 016: TargetSize016 = Decompress016(TargetBlock+(uint64_t)(16-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(16-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize016) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 017: TargetSize017 = Decompress017(TargetBlock+(uint64_t)(17-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(17-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize017) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 018: TargetSize018 = Decompress018(TargetBlock+(uint64_t)(18-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(18-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize018) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 019: TargetSize019 = Decompress019(TargetBlock+(uint64_t)(19-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(19-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize019) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 020: TargetSize020 = Decompress020(TargetBlock+(uint64_t)(20-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(20-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize020) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 021: TargetSize021 = Decompress021(TargetBlock+(uint64_t)(21-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(21-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize021) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 022: TargetSize022 = Decompress022(TargetBlock+(uint64_t)(22-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(22-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize022) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 023: TargetSize023 = Decompress023(TargetBlock+(uint64_t)(23-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(23-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize023) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 024: TargetSize024 = Decompress024(TargetBlock+(uint64_t)(24-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(24-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize024) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 025: TargetSize025 = Decompress025(TargetBlock+(uint64_t)(25-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(25-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize025) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 026: TargetSize026 = Decompress026(TargetBlock+(uint64_t)(26-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(26-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize026) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 027: TargetSize027 = Decompress027(TargetBlock+(uint64_t)(27-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(27-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize027) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 028: TargetSize028 = Decompress028(TargetBlock+(uint64_t)(28-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(28-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize028) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 029: TargetSize029 = Decompress029(TargetBlock+(uint64_t)(29-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(29-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize029) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 030: TargetSize030 = Decompress030(TargetBlock+(uint64_t)(30-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(30-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize030) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 031: TargetSize031 = Decompress031(TargetBlock+(uint64_t)(31-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(31-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize031) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } #ifdef Commence_OpenMP #pragma omp section #endif { // Thread 032: TargetSize032 = Decompress032(TargetBlock+(uint64_t)(32-1)*(TargetFileSize+512), SourceBlock+(uint64_t)(32-1)*SourceFileSize, SourceFileSize); if (TargetFileSize != TargetSize032) { printf("Lexx: Failure! Decompressed size mismatch!\n"); exit(13); } } } }// pragma printf("Pass #%2d of %d\n",i,Trials); } // trials loop #ifdef Commence_OpenMP printf("All threads finished.\n"); #endif #if defined(_icl_mumbo_jumbo_) ticksTOTAL2 = ticksTOTAL2 + GetRDTSC() - ticksStart; #endif #if defined(_gcc_mumbo_jumbo_) ticksTOTAL2 = ticksTOTAL2 + GetRDTSC() - ticksStart; #endif printf("Decompression time: %s ticks.\n", _ui64toaKAZEcomma(ticksTOTAL2, llTOaDigits, 10)); #if defined(_icl_mumbo_jumbo_) printf("TPI (Ticks_Per_Instruction_during_branchless_decompression) performance: %.3f\n", (float)(ticksTOTAL2/(float)Trials) / (float)((float)46*loopcounterfor411*NumberOfThreadsToPlayWith) ); printf("IPC (Instructions_Per_Clock_during_branchless_decompression) performance: %.3f\n\n", (float)((float)46*loopcounterfor411*NumberOfThreadsToPlayWith) / (float)(ticksTOTAL2/(float)Trials) ); #endif #if defined(_gcc_mumbo_jumbo_) printf("TPI (Ticks_Per_Instruction_during_branchless_decompression) performance: %.3f\n", (float)(ticksTOTAL2/(float)Trials) / (float)((float)47*loopcounterfor411*NumberOfThreadsToPlayWith) ); printf("IPC (Instructions_Per_Clock_during_branchless_decompression) performance: %.3f\n\n", (float)((float)47*loopcounterfor411*NumberOfThreadsToPlayWith) / (float)(ticksTOTAL2/(float)Trials) ); #endif strcpy(NewFileName, argv[1]); *( NewFileName + strlen(argv[1])-strlen(Nakamichi) ) = '\0'; } else { SourceBlock = (char*)malloc(SourceSize+512); TargetBlock = (char*)malloc(SourceSize+512+32*1024*1024); //+32*1024*1024, some files may be expanded instead of compressed. fread(SourceBlock, 1, SourceSize, fp); fclose(fp); printf("Compressing %d bytes ...\n", SourceSize ); clocks1 = clock(); while (clocks1 == clock()); clocks1 = clock(); TargetSize = Compress(TargetBlock, SourceBlock, SourceSize); clocks2 = clock(); k = (((float)1000*SourceSize/(clocks2 - clocks1 + 1))); //k=k>>10; printf("RAM-to-RAM performance: %d bytes/s.\n", k); strcpy(NewFileName, argv[1]); strcat(NewFileName, Nakamichi); printf("Compressed to %d bytes.\n", TargetSize ); } if ((fp = fopen(NewFileName, "wb")) == NULL) { printf("Nakamichi: Can't write '%s' file.\n", NewFileName); exit(13); } fwrite(TargetBlock, 1, TargetSize, fp); fclose(fp); if (BandwidthFlag) { // Benchmark memcpy() [ pointerALIGN = TargetBlock + 64 - (((size_t)TargetBlock) % 64); //offset=64-int((long)data&63); printf("Memory pool starting address: %p ... ", pointerALIGN); if (((uintptr_t)(const void *)pointerALIGN & (64 - 1)) == 0) printf( "64 byte aligned, OK\n"); else printf( "NOT 64 byte aligned, FAILURE\n"); clocks3 = clock(); while (clocks3 == clock()); clocks3 = clock(); printf("Copying a %dMB block 1024 times i.e. %dGB READ + %dGB WRITTEN ...\n", 512, 512, 512); for (i = 0; i < 1024; i++) { memcpy(pointerALIGN+512*1024*1024, pointerALIGN, 512*1024*1024); } clocks4 = clock(); duration = (double) (clocks4 - clocks3 + 1); durationGENERIC = duration; printf("memcpy(): (%dMB block); %dMB copied in %d clocks or %.3fMB per clock\n", 512, 1024*( 512 ), (int) duration, (float)1024*( 512 )/ ((int) duration)); /* #ifndef _N_GP clocks3 = clock(); while (clocks3 == clock()); clocks3 = clock(); printf("Copying a %dMB block 1024 times i.e. %dGB READ + %dGB WRITTEN ...\n", 512, 512, 512); for (i = 0; i < 1024; i++) { memcpy_SSE2_4K_prefetched(pointerALIGN+512*1024*1024, pointerALIGN, 512*1024*1024); } clocks4 = clock(); duration = (double) (clocks4 - clocks3 + 1); printf("memcpy_SSE2_4K_prefetched(): (%dMB block); %dMB copied in %d clocks or %.3fMB per clock\n", 512, 1024*( 512 ), (int) duration, (float)1024*( 512 )/ ((int) duration)); #endif #ifdef _N_YMM clocks3 = clock(); while (clocks3 == clock()); clocks3 = clock(); printf("Copying a %dMB block 1024 times i.e. %dGB READ + %dGB WRITTEN ...\n", 512, 512, 512); for (i = 0; i < 1024; i++) { memcpy_AVX_4K_prefetched(pointerALIGN+512*1024*1024, pointerALIGN, 512*1024*1024); } clocks4 = clock(); duration = (double) (clocks4 - clocks3 + 1); printf("memcpy_AVX_4K_prefetched(): (%dMB block); %dMB copied in %d clocks or %.3fMB per clock\n", 512, 1024*( 512 ), (int) duration, (float)1024*( 512 )/ ((int) duration)); #endif */ // Benchmark memcpy() ] //k = (((float)1000*TargetSize/(clocks2 - clocks1 + 1))); k=k>>20; j = (float)1000*1024*( 512 )/ ((int) durationGENERIC); printf("RAM-to-RAM performance vs memcpy() ratio (bigger-the-better): %d%%\n", (int)((float)k*100/j)); } free(TargetBlock); free(SourceBlock); exit(0); } void SearchIntoSlidingWindow(unsigned int* ShortMediumLongOFFSET, unsigned int* retIndex, unsigned int* retMatch, char* refStart,char* refEnd,char* encStart,char* encEnd){ char* FoundAtPosition; unsigned int match=0; // Too lazy to write Railgun-Reverse, it would save many ugly patches... // In order to avoid the unheardof slowness the 512MB may be reduced to 8MB... // --| char* refStartHOT = refEnd-(1024*8*128-1); // | char* refStartHOTTER = refEnd-(256*8*128-1); // | char* refStartHOTEST = refEnd-(4*8*128-1); // | char* refStartHOTultra = refEnd-(16-1); // | char* refStartCOLDERbig = refEnd-(256*1024*8*128-1); // | *retIndex=0; *retMatch=0; *ShortMediumLongOFFSET=0; #ifdef ReplaceBruteForceWithRailgunSwampshineBailOut // Also, finally it is time to fix the stupid offset (blind for files smaller than the current window) stupidity for small files: // Simply assign 'refStart' if it is within the current window i.e. between e.g. 'refEnd-(256*8*128-1)' and 'refEnd': // Nasty bug fixed (pointer getting negative) only here, to be fixed in all the rest variants [ /* if ( refStart >= refEnd-(2048*8*128-1) ) refStartHOT = refStart; //--| if ( refStart >= refEnd-(256*8*128-1) ) refStartHOTTER = refStart; //--| if ( refStart >= refEnd-(2*4*8*128-1) ) refStartHOTEST = refStart; //--| if ( refStart >= refEnd-(512*1024*8*128-1) ) refStartCOLDERbig = refStart; //--| // \ / */ // Nasty bug fixed (pointer getting negative) only here, to be fixed in all the rest variants ] if ( (1024*8*128-1)>= refEnd-refStart ) refStartHOT = refStart; if ( (256*8*128-1)>= refEnd-refStart ) refStartHOTTER = refStart; if ( (4*8*128-1)>= refEnd-refStart ) refStartHOTEST = refStart; if ( (16-1)>= refEnd-refStart ) refStartHOTultra = refStart; if ( (256*1024*8*128-1)>= refEnd-refStart ) refStartCOLDERbig = refStart; //printf("%d\n", refStartCOLDERbig); //debug //printf("%p\n", refStartCOLDERbig); //debug // Chosen order is sizewise: // 12:1 = 12 // 8:1 = 8 // 16:2 = 8 // 12:2 = 6 // 16:3 = 5.3 // 4:1 = 4 // 8:2 = 4 // 12:3 = 4 // 8:3 = 2.6 // 4:2 = 2 // 4:3 = 1.3 // Tengu: // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // LL = 00b means Long MatchLength, (4-LL)<<2 or 16 // LL = 01b means Long MatchLength, (4-LL)<<2 or 12 // LL = 10b means Long MatchLength, (4-LL)<<2 or 8 // LL = 11b means Long MatchLength, (4-LL)<<2 or 4 // OO = 00b means Literal // OO = 01b MatchOffset, 0xFFFFFFFF>>OO, 3 bytes long i.e. Sliding Window is 3*8-LL-OO=3*8-4=20 or 1MB // OO = 10b MatchOffset, 0xFFFFFFFF>>OO, 2 bytes long i.e. Sliding Window is 2*8-LL-OO=2*8-4=12 or 4KB // OO = 11b MatchOffset, 0xFFFFFFFF>>OO, 1 byte long i.e. Sliding Window is 1*8-LL-OO=1*8-4=4 or 16B // Chosen order is sizewise: // 12:2 = 6 // 8:2 = 4 // 12:3 = 4 // 12:4 = 3 // 8:3 = 2.6 // 8:4 = 2 // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|L|xxxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // L = 0b means Long MatchLength, (12-L) or 12 // L = 1b means Short MatchLength, (12-L) or 8 // OO = 00b means Literal // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-L-OO=(1+OO)*8-3=13 or 8KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-L-OO=(1+OO)*8-3=21 or 2MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-L-OO=(1+OO)*8-3=29 or 512MB // (8-0):2 = 4 // (12-0):3 = 4 // (16-0):4 = 4 // (16-4):4 = 3 // (12-4):3 = 2.6 // (8-4):2 = 2 // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 1100b means Literal // LL = 00b means 04 MatchLength, (1+LL)<<2) // LL = 01b means 08 MatchLength, (1+LL)<<2) // LL = 10b means 12 MatchLength, (1+LL)<<2) // LL = 11b means 16 MatchLength, (1+LL)<<2) // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // Chosen order is sizewise: // 12:1 = 12 // 8:1 = 8 // 16:2 = 8 // 12:2 = 6 // 16:3 = 5.3 // 4:1 = 4 // 8:2 = 4 // 12:3 = 4 // 16:4 = 4 // 12:4 = 3 // 8:3 = 2.6 // 4:2 = 2 // 8:4 = 2 // 4:3 = 1.3 // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. // 12:1 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTultra >= refStart) if (refStartHOTultra < refEnd) { FoundAtPosition = Railgun_Doublet (refStartHOTultra, encStart, (uint32_t)(refEnd-refStartHOTultra), 12); if (FoundAtPosition!=NULL) { *retMatch=12; // The first four bits should be: // 1000b = 8 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xF0)|0x0008; // xx ... x[LLOO] *ShortMediumLongOFFSET=1; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 8:1 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTultra >= refStart) if (refStartHOTultra < refEnd) { FoundAtPosition = Railgun_Doublet (refStartHOTultra, encStart, (uint32_t)(refEnd-refStartHOTultra), 8); if (FoundAtPosition!=NULL) { *retMatch=8; // The first four bits should be: // 0100b = 4 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xF0)|0x0004; // xx ... x[LLOO] *ShortMediumLongOFFSET=1; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 16:2 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTEST >= refStart) if (refStartHOTEST < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTEST, encStart, (uint32_t)(refEnd-refStartHOTEST), 16); if (FoundAtPosition!=NULL) { *retMatch=16; // The first four bits should be: // 1101b = 13 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFF0)|0x000D; // xx ... x[LLOO] *ShortMediumLongOFFSET=2; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 32:4 = 4 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTTER >= refStart) if (refStartHOTTER < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTTER, encStart, (uint32_t)(refEnd-refStartHOTTER), 32); if (FoundAtPosition!=NULL) { *retMatch=32; // The first four bits should be: // 1111b = F *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x000F; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOT >= refStart) if (refStartHOT < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOT, encStart, (uint32_t)(refEnd-refStartHOT), 32); if (FoundAtPosition!=NULL) { *retMatch=32; // The first four bits should be: // 1111b = F *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x000F; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartCOLDERbig >= refStart) if (refStartCOLDERbig < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartCOLDERbig, encStart, (uint32_t)(refEnd-refStartCOLDERbig), 32); if (FoundAtPosition!=NULL) { *retMatch=32; // The first four bits should be: // 1111b = F *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x000F; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 12:2 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTEST >= refStart) if (refStartHOTEST < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTEST, encStart, (uint32_t)(refEnd-refStartHOTEST), 12); if (FoundAtPosition!=NULL) { *retMatch=12; // The first four bits should be: // 1001b = 9 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFF0)|0x0009; // xx ... x[LLOO] *ShortMediumLongOFFSET=2; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 24:4 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTTER >= refStart) if (refStartHOTTER < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTTER, encStart, (uint32_t)(refEnd-refStartHOTTER), 24); if (FoundAtPosition!=NULL) { *retMatch=24; // The first four bits should be: // 1011b = B *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x000B; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOT >= refStart) if (refStartHOT < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOT, encStart, (uint32_t)(refEnd-refStartHOT), 24); if (FoundAtPosition!=NULL) { *retMatch=24; // The first four bits should be: // 1011b = B *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x000B; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartCOLDERbig >= refStart) if (refStartCOLDERbig < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartCOLDERbig, encStart, (uint32_t)(refEnd-refStartCOLDERbig), 24); if (FoundAtPosition!=NULL) { *retMatch=24; // The first four bits should be: // 1011b = B *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x000B; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 16:3 = 5.3 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTTER >= refStart) if (refStartHOTTER < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTTER, encStart, (uint32_t)(refEnd-refStartHOTTER), 16); if (FoundAtPosition!=NULL) { *retMatch=16; // The first four bits should be: // 1110b = 14 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFF0)|0x000E; // xx ... x[LLOO] *ShortMediumLongOFFSET=3; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOT >= refStart) if (refStartHOT < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOT, encStart, (uint32_t)(refEnd-refStartHOT), 16); if (FoundAtPosition!=NULL) { *retMatch=16; // The first four bits should be: // 1110b = 14 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFF0)|0x000E; // xx ... x[LLOO] *ShortMediumLongOFFSET=3; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 4:1 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTultra >= refStart) if (refStartHOTultra < refEnd) { FoundAtPosition = Railgun_Doublet (refStartHOTultra, encStart, (uint32_t)(refEnd-refStartHOTultra), 4); if (FoundAtPosition!=NULL) { *retMatch=4; // The first four bits should be: // 0000b = 0 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xF0)|0x0000; // xx ... x[LLOO] *ShortMediumLongOFFSET=1; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 8:2 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTEST >= refStart) if (refStartHOTEST < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTEST, encStart, (uint32_t)(refEnd-refStartHOTEST), 8); if (FoundAtPosition!=NULL) { *retMatch=8; // The first four bits should be: // 0101b = 5 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFF0)|0x0005; // xx ... x[LLOO] *ShortMediumLongOFFSET=2; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 12:3 = 4 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTTER >= refStart) if (refStartHOTTER < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTTER, encStart, (uint32_t)(refEnd-refStartHOTTER), 12); if (FoundAtPosition!=NULL) { *retMatch=12; // The first four bits should be: // 1010b = 10 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFF0)|0x000A; // xx ... x[LLOO] *ShortMediumLongOFFSET=3; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOT >= refStart) if (refStartHOT < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOT, encStart, (uint32_t)(refEnd-refStartHOT), 12); if (FoundAtPosition!=NULL) { *retMatch=12; // The first four bits should be: // 1010b = 10 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFF0)|0x000A; // xx ... x[LLOO] *ShortMediumLongOFFSET=3; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 16:4 = 4 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTTER >= refStart) if (refStartHOTTER < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTTER, encStart, (uint32_t)(refEnd-refStartHOTTER), 16); if (FoundAtPosition!=NULL) { *retMatch=16; // The first four bits should be: // 0111b = 7 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x0007; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOT >= refStart) if (refStartHOT < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOT, encStart, (uint32_t)(refEnd-refStartHOT), 16); if (FoundAtPosition!=NULL) { *retMatch=16; // The first four bits should be: // 0111b = 7 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x0007; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartCOLDERbig >= refStart) if (refStartCOLDERbig < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartCOLDERbig, encStart, (uint32_t)(refEnd-refStartCOLDERbig), 16); if (FoundAtPosition!=NULL) { *retMatch=16; // The first four bits should be: // 0111b = 7 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x0007; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 8:3 = 2.6 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTTER >= refStart) if (refStartHOTTER < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTTER, encStart, (uint32_t)(refEnd-refStartHOTTER), 8); if (FoundAtPosition!=NULL) { *retMatch=8; // The first four bits should be: // 0110b = 6 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFF0)|0x0006; // xx ... x[LLOO] *ShortMediumLongOFFSET=3; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOT >= refStart) if (refStartHOT < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOT, encStart, (uint32_t)(refEnd-refStartHOT), 8); if (FoundAtPosition!=NULL) { *retMatch=8; // The first four bits should be: // 0110b = 6 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFF0)|0x0006; // xx ... x[LLOO] *ShortMediumLongOFFSET=3; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 4:2 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTEST >= refStart) if (refStartHOTEST < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTEST, encStart, (uint32_t)(refEnd-refStartHOTEST), 4); if (FoundAtPosition!=NULL) { *retMatch=4; // The first four bits should be: // 0001b = 1 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFF0)|0x0001; // xx ... x[LLOO] *ShortMediumLongOFFSET=2; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 8:4 = 2 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTTER >= refStart) if (refStartHOTTER < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTTER, encStart, (uint32_t)(refEnd-refStartHOTTER), 8); if (FoundAtPosition!=NULL) { *retMatch=8; // The first four bits should be: // 0011b = 3 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x0003; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOT >= refStart) if (refStartHOT < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOT, encStart, (uint32_t)(refEnd-refStartHOT), 8); if (FoundAtPosition!=NULL) { *retMatch=8; // The first four bits should be: // 0011b = 3 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x0003; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartCOLDERbig >= refStart) if (refStartCOLDERbig < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartCOLDERbig, encStart, (uint32_t)(refEnd-refStartCOLDERbig), 8); if (FoundAtPosition!=NULL) { *retMatch=8; // The first four bits should be: // 0011b = 3 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFFFF0)|0x0003; // xx ... x[LLOO] *ShortMediumLongOFFSET=4; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // 4:3 = 1.3 // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOTTER >= refStart) if (refStartHOTTER < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOTTER, encStart, (uint32_t)(refEnd-refStartHOTTER), 4); if (FoundAtPosition!=NULL) { *retMatch=4; // The first four bits should be: // 0010b = 2 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFF0)|0x0002; // xx ... x[LLOO] *ShortMediumLongOFFSET=3; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) [ if (refStartHOT >= refStart) if (refStartHOT < refEnd) { FoundAtPosition = Railgun_Swampshine_BailOut(refStartHOT, encStart, (uint32_t)(refEnd-refStartHOT), 4); if (FoundAtPosition!=NULL) { *retMatch=4; // The first four bits should be: // 0010b = 2 *retIndex=(((refEnd-FoundAtPosition)<<4)&0xFFFFF0)|0x0002; // xx ... x[LLOO] *ShortMediumLongOFFSET=3; return; } } // Pre-emptive strike, matches should be sought close to the lookahead (cache-friendliness) ] #else while(refStart < refEnd){ match=SlidingWindowVsLookAheadBuffer(refStart,refEnd,encStart,encEnd); if(match > *retMatch){ *retMatch=match; *retIndex=refEnd-refStart; } if(*retMatch >= Min_Match_BAILOUT_Length) break; refStart++; } #endif } unsigned int SlidingWindowVsLookAheadBuffer( char* refStart, char* refEnd, char* encStart,char* encEnd){ int ret = 0; while(refStart[ret] == encStart[ret]){ if(&refStart[ret] >= refEnd) break; if(&encStart[ret] >= encEnd) break; ret++; if(ret >= Min_Match_BAILOUT_Length) break; } return ret; } unsigned int Compress(char* ret, char* src, unsigned int srcSize){ unsigned int srcIndex=0; unsigned int retIndex=0; unsigned int index=0; unsigned int match=0; unsigned int notMatch=0; unsigned char* notMatchStart=NULL; char* refStart=NULL; char* encEnd=NULL; int Melnitchka=0; char *Auberge[4] = {"|\0","/\0","-\0","\\\0"}; int ProgressIndicator; unsigned int NumberOfFullLiterals=0; int GLOBALwindowmatchT1=0; int GLOBALwindowmatchT2=0; int GLOBALwindowmatchT3=0; int GLOBALwindowmatchT4=0; int GLOBALwindowmatchS1=0; int GLOBALwindowmatchS2=0; int GLOBALwindowmatchS3=0; int GLOBALwindowmatchS4=0; int GLOBALwindowmatchM1=0; int GLOBALwindowmatchM2=0; int GLOBALwindowmatchM3=0; int GLOBALwindowmatchM4=0; int GLOBALwindowmatchL1=0; int GLOBALwindowmatchL2=0; int GLOBALwindowmatchL3=0; int GLOBALwindowmatchL4=0; unsigned int ShortMediumLongOFFSET=0; while(srcIndex < srcSize){ if(srcIndex>=REF_SIZE) refStart=&src[srcIndex-REF_SIZE]; else refStart=src; if(srcIndex>=srcSize-ENC_SIZE) encEnd=&src[srcSize]; else encEnd=&src[srcIndex+ENC_SIZE]; // Fixing the stupid 'search-beyond-end' bug: if(srcIndex+ENC_SIZE < srcSize) { SearchIntoSlidingWindow(&ShortMediumLongOFFSET,&index,&match,refStart,&src[srcIndex],&src[srcIndex],encEnd); if ( ShortMediumLongOFFSET==1 && match==4 ) GLOBALwindowmatchT1++; if ( ShortMediumLongOFFSET==1 && match==8 ) GLOBALwindowmatchT2++; if ( ShortMediumLongOFFSET==1 && match==12 ) GLOBALwindowmatchT3++; if ( ShortMediumLongOFFSET==1 && match==16 ) GLOBALwindowmatchT4++; if ( ShortMediumLongOFFSET==2 && match==4 ) GLOBALwindowmatchS1++; if ( ShortMediumLongOFFSET==2 && match==8 ) GLOBALwindowmatchS2++; if ( ShortMediumLongOFFSET==2 && match==12 ) GLOBALwindowmatchS3++; if ( ShortMediumLongOFFSET==2 && match==16 ) GLOBALwindowmatchS4++; if ( ShortMediumLongOFFSET==3 && match==4 ) GLOBALwindowmatchM1++; if ( ShortMediumLongOFFSET==3 && match==8 ) GLOBALwindowmatchM2++; if ( ShortMediumLongOFFSET==3 && match==12 ) GLOBALwindowmatchM3++; if ( ShortMediumLongOFFSET==3 && match==16 ) GLOBALwindowmatchM4++; if ( ShortMediumLongOFFSET==4 && match==4*2 ) GLOBALwindowmatchL1++; if ( ShortMediumLongOFFSET==4 && match==8*2 ) GLOBALwindowmatchL2++; if ( ShortMediumLongOFFSET==4 && match==12*2 ) GLOBALwindowmatchL3++; if ( ShortMediumLongOFFSET==4 && match==16*2 ) GLOBALwindowmatchL4++; } else match=0; // Nothing to find. //if ( match srcIndex % (1<<16)) { ProgressIndicator = (int)( (srcIndex+1)*(float)100/(srcSize+1) ); printf("%s; Each rotation means 64KB are encoded; Done %d%%\r", Auberge[Melnitchka++], ProgressIndicator ); Melnitchka = Melnitchka & 3; // 0 1 2 3: 00 01 10 11 } } else { if(notMatch > 0){ *notMatchStart=(unsigned char)((notMatch)<<(4-0)); *notMatchStart=(unsigned char)((notMatch)<<(4-0)) | 0x0C; // Entag it as Literal notMatch=0; } // ---------------------| // \ / //ret[retIndex] = 0x80; // Assuming seventh/fifteenth bit is zero i.e. LONG MATCH i.e. Min_Match_BAILOUT_Length*4 //if ( match==Min_Match_BAILOUT_Length ) ret[retIndex] = 0xC0; // 8bit&7bit set, SHORT MATCH if seventh/fifteenth bit is not zero i.e. Min_Match_BAILOUT_Length // / \ // ---------------------| /* ret[retIndex] = 0x01; // Assuming seventh/fifteenth bit is zero i.e. LONG MATCH i.e. Min_Match_BAILOUT_Length*4 if ( match==Min_Match_BAILOUT_Length ) ret[retIndex] = 0x03; // 2bit&1bit set, LONG MATCH if 2bit is not zero i.e. Min_Match_BAILOUT_Length */ // No need of above, during compression we demanded lowest 2bits to be not 00. // 1bit+3bits+12bits: //ret[retIndex] = ret[retIndex] | ((match-Min_Match_Length)<<4); //ret[retIndex] = ret[retIndex] | (((index-Min_Match_Length) & 0x0F00)>>8); // 1bit+1bit+14bits: //ret[retIndex] = ret[retIndex] | ((match-Min_Match_Length)<<(8-(LengthBITS+1))); // No need to set the matchlength // The fragment below is outrageously ineffective - instead of 8bit&7bit I have to use the lower TWO bits i.e. 2bit&1bit as flags, thus in decompressing one WORD can be fetched instead of two BYTE loads followed by SHR by 2. // ---------------------| // \ / //ret[retIndex] = ret[retIndex] | (((index-Min_Match_Length) & 0x3F00)>>8); // 2+4+8=14 //retIndex++; //ret[retIndex] = (char)((index-Min_Match_Length) & 0x00FF); //retIndex++; // / \ // ---------------------| // Now the situation is like LOW:HIGH i.e. FF:3F i.e. 0x3FFF, 16bit&15bit used as flags, // should become LOW:HIGH i.e. FC:FF i.e. 0xFFFC, 2bit&1bit used as flags. /* ret[retIndex] = ret[retIndex] | (((index-Min_Match_Length) & 0x00FF)<<2); // 6+8=14 //ret[retIndex] = ret[retIndex] | (((index-Min_Match_Length) & 0x00FF)<<1); // 7+8=15 retIndex++; ret[retIndex] = (char)(((index-Min_Match_Length) & 0x3FFF)>>6); //ret[retIndex] = (char)(((index-Min_Match_Length) & 0x7FFF)>>7); retIndex++; */ // No need of above, during compression we demanded lowest 2bits to be not 00, use the full 16bits and get rid of the stupid '+/-' Min_Match_Length. //if (index>0xFFFF) {printf ("\nFatal error: Overflow!\n"); exit(13);} //memcpy(&ret[retIndex],&index,2+1); // copy lower 2 bytes //retIndex++; //retIndex++; //retIndex++; memcpy(&ret[retIndex],&index,ShortMediumLongOFFSET); retIndex = retIndex + ShortMediumLongOFFSET; // / \ // ---------------------| srcIndex+=match; if ((srcIndex-match) % (1<<16) > srcIndex % (1<<16)) { ProgressIndicator = (int)( (srcIndex+1)*(float)100/(srcSize+1) ); printf("%s; Each rotation means 64KB are encoded; Done %d%%\r", Auberge[Melnitchka++], ProgressIndicator ); Melnitchka = Melnitchka & 3; // 0 1 2 3: 00 01 10 11 } } } if(notMatch > 0){ *notMatchStart=(unsigned char)((notMatch)<<(4-0)); *notMatchStart=(unsigned char)((notMatch)<<(4-0)) | 0x0C; // Entag it as Literal } printf("%s; Each rotation means 64KB are encoded; Done %d%%\n", Auberge[Melnitchka], 100 ); printf("NumberOfFullLiterals (lower-the-better): %d\n", NumberOfFullLiterals ); printf("Legend: MatchLengths: 4|8=Tiny, 8|16=Short, 12|24=Medium, 16|32=Long; WindowSizes: 1/2/3/4=Tiny/Short/Medium/Long\n"); printf("NumberOf(Tiny )Matches,[Tiny ]Window (%d)[%d]: %d\n", 4, 1, GLOBALwindowmatchT1); printf("NumberOf(Short )Matches,[Tiny ]Window (%d)[%d]: %d\n", 8, 1, GLOBALwindowmatchT2); printf("NumberOf(Medium)Matches,[Tiny ]Window (%d)[%d]: %d\n", 12, 1, GLOBALwindowmatchT3); printf("NumberOf(Tiny )Matches,[Short ]Window (%d)[%d]: %d\n", 4, 2, GLOBALwindowmatchS1); printf("NumberOf(Short )Matches,[Short ]Window (%d)[%d]: %d\n", 8, 2, GLOBALwindowmatchS2); printf("NumberOf(Medium)Matches,[Short ]Window (%d)[%d]: %d\n", 12, 2, GLOBALwindowmatchS3); printf("NumberOf(Long )Matches,[Short ]Window (%d)[%d]: %d\n", 16, 2, GLOBALwindowmatchS4); printf("NumberOf(Tiny )Matches,[Medium]Window (%d)[%d]: %d\n", 4, 3, GLOBALwindowmatchM1); printf("NumberOf(Short )Matches,[Medium]Window (%d)[%d]: %d\n", 8, 3, GLOBALwindowmatchM2); printf("NumberOf(Medium)Matches,[Medium]Window (%d)[%d]: %d\n", 12, 3, GLOBALwindowmatchM3); printf("NumberOf(Long )Matches,[Medium]Window (%d)[%d]: %d\n", 16, 3, GLOBALwindowmatchM4); printf("NumberOf(Tiny )Matches,[Long ]Window (%d)[%d]: %d\n", 2*4, 4, GLOBALwindowmatchL1); printf("NumberOf(Short )Matches,[Long ]Window (%d)[%d]: %d\n", 2*8, 4, GLOBALwindowmatchL2); printf("NumberOf(Medium)Matches,[Long ]Window (%d)[%d]: %d\n", 2*12, 4, GLOBALwindowmatchL3); printf("NumberOf(Long )Matches,[Long ]Window (%d)[%d]: %d\n", 2*16, 4, GLOBALwindowmatchL4); return retIndex; } uint64_t Decompress (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } /* ; 'Oniyanma-Monsterdragonfly-Lexx_branchfull' decompression loop, 7d-14+2=107 bytes long: ; mark_description "Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140"; ; mark_description "-O3 -QxSSE2 -D_N_YMM -D_N_prefetch_4096 -FAcs"; .B8.3:: 00014 41 8b 02 mov eax, DWORD PTR [r10] 00017 89 c1 mov ecx, eax 00019 83 f1 03 xor ecx, 3 0001c 41 bb ff ff ff ff mov r11d, -1 00022 c1 e1 03 shl ecx, 3 00025 41 d3 eb shr r11d, cl 00028 41 23 c3 and eax, r11d 0002b 89 c1 mov ecx, eax 0002d 83 e1 0f and ecx, 15 00030 83 f9 0c cmp ecx, 12 00033 75 17 jne .B8.5 .B8.4:: 00035 c4 c1 7e 6f 42 01 vmovdqu ymm0, YMMWORD PTR [1+r10] 0003b c1 e8 04 shr eax, 4 0003e c5 fe 7f 02 vmovdqu YMMWORD PTR [rdx], ymm0 00042 48 03 d0 add rdx, rax 00045 ff c0 inc eax 00047 4c 03 d0 add r10, rax 0004a eb 2e jmp .B8.6 .B8.5:: 0004c 41 89 c3 mov r11d, eax 0004f 89 c1 mov ecx, eax 00051 41 c1 eb 04 shr r11d, 4 00055 83 e1 03 and ecx, 3 00058 ff c1 inc ecx 0005a 49 f7 db neg r11 0005d 83 e0 0c and eax, 12 00060 4c 03 da add r11, rdx 00063 83 c0 04 add eax, 4 00066 4c 03 d1 add r10, rcx 00069 c1 e9 02 shr ecx, 2 0006c c4 c1 7e 6f 03 vmovdqu ymm0, YMMWORD PTR [r11] 00071 d3 e0 shl eax, cl 00073 c5 fe 7f 02 vmovdqu YMMWORD PTR [rdx], ymm0 00077 48 03 d0 add rdx, rax .B8.6:: 0007a 4d 3b d0 cmp r10, r8 0007d 72 95 jb .B8.3 */ /* ; 'Oniyanma-Monsterdragonfly-Lexx_branchless' decompression loop, c2-2d+6=155 bytes long: ; mark_description "Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 15.0.0.108 Build 20140"; ; mark_description "-O3 -QxSSE2 -D_N_YMM -D_N_prefetch_4096 -FAcs"; .B8.3:: 0002d 44 8b 3a mov r15d, DWORD PTR [rdx] 00030 44 89 f9 mov ecx, r15d 00033 83 f1 03 xor ecx, 3 00036 41 bc ff ff ff ff mov r12d, -1 0003c c1 e1 03 shl ecx, 3 0003f bd 01 00 00 00 mov ebp, 1 00044 41 d3 ec shr r12d, cl 00047 45 23 fc and r15d, r12d 0004a 45 33 e4 xor r12d, r12d 0004d 45 89 fe mov r14d, r15d 00050 45 89 fb mov r11d, r15d 00053 41 83 e6 0f and r14d, 15 00057 4c 89 c9 mov rcx, r9 0005a 41 83 fe 0c cmp r14d, 12 0005e 44 0f 44 e5 cmove r12d, ebp 00062 48 89 d5 mov rbp, rdx 00065 41 c1 eb 04 shr r11d, 4 00069 41 ff cc dec r12d 0006c 45 89 da mov r10d, r11d 0006f 4d 89 e6 mov r14, r12 00072 49 2b ca sub rcx, r10 00075 49 f7 d6 not r14 00078 48 ff c9 dec rcx 0007b 49 23 ee and rbp, r14 0007e 49 23 cc and rcx, r12 00081 41 ff c3 inc r11d 00084 4d 23 d6 and r10, r14 00087 4d 23 de and r11, r14 0008a c5 fe 6f 44 29 01 vmovdqu ymm0, YMMWORD PTR [1+rcx+rbp] 00090 44 89 fd mov ebp, r15d 00093 83 e5 03 and ebp, 3 00096 41 83 e7 0c and r15d, 12 0009a ff c5 inc ebp 0009c 41 83 c7 04 add r15d, 4 000a0 89 e9 mov ecx, ebp 000a2 c1 e9 02 shr ecx, 2 000a5 41 d3 e7 shl r15d, cl 000a8 49 23 ec and rbp, r12 000ab 4d 23 fc and r15, r12 000ae 4c 03 dd add r11, rbp 000b1 4d 03 d7 add r10, r15 000b4 49 03 d3 add rdx, r11 000b7 c4 c1 7e 7f 01 vmovdqu YMMWORD PTR [r9], ymm0 000bc 4d 03 ca add r9, r10 000bf 49 3b d0 cmp rdx, r8 000c2 0f 82 65 ff ff ff jb .B8.3 */ /* ; 'Tengu' decompression loop, 70-11+2=97 bytes long: ; mark_description "Intel(R) C++ Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 12.1.1.258 Build 20111"; ; mark_description "-O3 -QxSSE2 -D_N_XMM -FAcs"; .B7.3:: 00011 8b 02 mov eax, DWORD PTR [rdx] 00013 89 c1 mov ecx, eax 00015 83 e1 03 and ecx, 3 00018 75 1a jne .B7.5 .B7.4:: 0001a 0f b6 c0 movzx eax, al 0001d f3 0f 6f 42 01 movdqu xmm0, XMMWORD PTR [1+rdx] 00022 c1 e8 04 shr eax, 4 00025 f3 41 0f 7f 01 movdqu XMMWORD PTR [r9], xmm0 0002a 4c 03 c8 add r9, rax 0002d ff c0 inc eax 0002f 48 03 d0 add rdx, rax 00032 eb 39 jmp .B7.6 .B7.5:: 00034 c1 e1 03 shl ecx, 3 00037 41 bb ff ff ff ff mov r11d, -1 0003d 41 d3 eb shr r11d, cl 00040 44 23 d8 and r11d, eax 00043 83 e0 0c and eax, 12 00046 41 c1 eb 04 shr r11d, 4 0004a f7 d8 neg eax 0004c 83 c0 10 add eax, 16 0004f 49 f7 db neg r11 00052 4d 03 d9 add r11, r9 00055 c1 e9 03 shr ecx, 3 00058 f7 d9 neg ecx 0005a 83 c1 04 add ecx, 4 0005d f3 41 0f 6f 03 movdqu xmm0, XMMWORD PTR [r11] 00062 f3 41 0f 7f 01 movdqu XMMWORD PTR [r9], xmm0 00067 4c 03 c8 add r9, rax 0006a 48 03 d1 add rdx, rcx .B7.6:: 0006d 49 3b d0 cmp rdx, r8 00070 72 9f jb .B7.3 */ // Results on Core2 T7500 2200MHz: /* D:\Nakamichi_Rakka>Nakamichi_Rakka_YMMless.exe enwik8.Nakamichi /report Nakamichi 'Rakka', written by Kaze, based on Nobuo Ito's LZSS source, babealicious suggestion by m^2 enforced, muffinesque suggestion by Jim Dempsey enforced. Decompressing 41699410 bytes ... RAM-to-RAM performance: 448 MB/s. Memory pool starting address: 0000000002C40080 ... 64 byte aligned, OK Copying a 256MB block 1024 times i.e. 256GB READ + 256GB WRITTEN ... memcpy(): (256MB block); 262144MB copied in 139621 clocks or 1.878MB per clock RAM-to-RAM performance vs memcpy() ratio (bigger-the-better): 23% D:\Nakamichi_Rakka>Nakamichi_Rakka_YMMless.exe OSHO.TXT.Nakamichi /report Nakamichi 'Rakka', written by Kaze, based on Nobuo Ito's LZSS source, babealicious suggestion by m^2 enforced, muffinesque suggestion by Jim Dempsey enforced. Decompressing 66989899 bytes ... RAM-to-RAM performance: 576 MB/s. Memory pool starting address: 0000000004450080 ... 64 byte aligned, OK Copying a 256MB block 1024 times i.e. 256GB READ + 256GB WRITTEN ... memcpy(): (256MB block); 262144MB copied in 139559 clocks or 1.878MB per clock RAM-to-RAM performance vs memcpy() ratio (bigger-the-better): 30% D:\Nakamichi_Rakka>Nakamichi_Rakka_YMMless.exe silesia.tar Nakamichi 'Rakka', written by Kaze, based on Nobuo Ito's LZSS source, babealicious suggestion by m^2 enforced, muffinesque suggestion by Jim Dempsey enforced. Compressing 211948544 bytes ... -; Each rotation means 64KB are encoded; Done 100% NumberOfFullLiterals (lower-the-better): 457929 NumberOf(Tiny)Matches[Tiny]Window (4): 501239 NumberOf(Short)Matches[Tiny]Window (8): 175044 NumberOf(Short)Matches[Long]Window (8): 436335 NumberOf(Medium)Matches[Long]Window (16): 185722 NumberOf(Long)Matches[Long]Window (32): 113643 RAM-to-RAM performance: 2 KB/s. 05/16/2014 08:22 AM 206,908,949 OSHO.TXT 09/15/2014 08:18 AM 66,989,899 OSHO.TXT.Rakka.Nakamichi 05/16/2014 08:22 AM 211,948,544 silesia.tar 09/16/2014 08:58 PM 80,315,189 silesia.tar.Rakka.Nakamichi D:\Nakamichi_Rakka>Nakamichi_Rakka_YMMless.exe silesia.tar.Nakamichi /report Nakamichi 'Rakka', written by Kaze, based on Nobuo Ito's LZSS source, babealicious suggestion by m^2 enforced, muffinesque suggestion by Jim Dempsey enforced. Decompressing 80315189 bytes ... RAM-to-RAM performance: 512 MB/s. */ // Kaibutsu // Decompression main loop: /* ; mark_description "Intel(R) C++ Compiler XE for applications running on IA-32, Version 12.1.1.258 Build 20111011"; ; mark_description "-O3 -QxSSE2 -D_N_XMM -FAcs"; .B7.3: 0001e 0f b7 34 3a movzx esi, WORD PTR [edx+edi] 00022 8b de mov ebx, esi 00024 81 e3 ff 00 00 00 and ebx, 255 0002a 83 fb 10 cmp ebx, 16 0002d 72 1d jb .B7.5 .B7.4: 0002f 8b 4c 24 10 mov ecx, DWORD PTR [16+esp] 00033 f7 de neg esi 00035 83 c2 02 add edx, 2 00038 8d 1c 01 lea ebx, DWORD PTR [ecx+eax] 0003b 83 c0 07 add eax, 7 0003e 03 f3 add esi, ebx 00040 8b 0e mov ecx, DWORD PTR [esi] 00042 8b 76 04 mov esi, DWORD PTR [4+esi] 00045 89 0b mov DWORD PTR [ebx], ecx 00047 89 73 04 mov DWORD PTR [4+ebx], esi 0004a eb 15 jmp .B7.6 .B7.5: 0004c f3 0f 6f 44 3a 01 movdqu xmm0, XMMWORD PTR [1+edx+edi] 00052 8b 4c 24 10 mov ecx, DWORD PTR [16+esp] 00056 8d 54 1a 01 lea edx, DWORD PTR [1+edx+ebx] 0005a f3 0f 7f 04 08 movdqu XMMWORD PTR [eax+ecx], xmm0 0005f 03 c3 add eax, ebx .B7.6: 00061 3b 54 24 18 cmp edx, DWORD PTR [24+esp] 00065 72 b7 jb .B7.3 */ // Decompression main loop: /* ; mark_description "Intel(R) C++ Compiler XE for applications running on IA-32, Version 12.1.1.258 Build 20111011"; ; mark_description "-O3 -D_N_GP -FAcs"; .B6.3: 00026 8b 4c 24 14 mov ecx, DWORD PTR [20+esp] 0002a 0f b7 1c 2a movzx ebx, WORD PTR [edx+ebp] 0002e f6 c3 07 test bl, 7 00031 8d 0c 01 lea ecx, DWORD PTR [ecx+eax] 00034 75 45 jne .B6.5 .B6.4: 00036 8b 74 2a 01 mov esi, DWORD PTR [1+edx+ebp] 0003a 8b 7c 2a 05 mov edi, DWORD PTR [5+edx+ebp] 0003e 0f b6 db movzx ebx, bl 00041 89 31 mov DWORD PTR [ecx], esi 00043 89 79 04 mov DWORD PTR [4+ecx], edi 00046 8b 74 2a 09 mov esi, DWORD PTR [9+edx+ebp] 0004a 8b 7c 2a 0d mov edi, DWORD PTR [13+edx+ebp] 0004e c1 eb 03 shr ebx, 3 00051 89 71 08 mov DWORD PTR [8+ecx], esi 00054 03 c3 add eax, ebx 00056 89 79 0c mov DWORD PTR [12+ecx], edi 00059 8b 74 2a 11 mov esi, DWORD PTR [17+edx+ebp] 0005d 8b 7c 2a 15 mov edi, DWORD PTR [21+edx+ebp] 00061 89 71 10 mov DWORD PTR [16+ecx], esi 00064 89 79 14 mov DWORD PTR [20+ecx], edi 00067 8b 74 2a 19 mov esi, DWORD PTR [25+edx+ebp] 0006b 8b 7c 2a 1d mov edi, DWORD PTR [29+edx+ebp] 0006f 8d 54 1a 01 lea edx, DWORD PTR [1+edx+ebx] 00073 89 71 18 mov DWORD PTR [24+ecx], esi 00076 89 79 1c mov DWORD PTR [28+ecx], edi 00079 eb 14 jmp .B6.6 .B6.5: 0007b f7 db neg ebx 0007d 83 c2 02 add edx, 2 00080 03 d9 add ebx, ecx 00082 83 c0 08 add eax, 8 00085 8b 33 mov esi, DWORD PTR [ebx] 00087 8b 5b 04 mov ebx, DWORD PTR [4+ebx] 0008a 89 31 mov DWORD PTR [ecx], esi 0008c 89 59 04 mov DWORD PTR [4+ecx], ebx .B6.6: 0008f 3b 54 24 1c cmp edx, DWORD PTR [28+esp] 00093 72 91 jb .B6.3 ; mark_description "Intel(R) C++ Compiler XE for applications running on IA-32, Version 12.1.1.258 Build 20111011"; ; mark_description "-O3 -QxSSE2 -D_N_XMM -FAcs"; .B7.3: 0001e 8b 4c 24 10 mov ecx, DWORD PTR [16+esp] 00022 0f b7 34 3a movzx esi, WORD PTR [edx+edi] 00026 f7 c6 07 00 00 00 test esi, 7 0002c 8d 1c 01 lea ebx, DWORD PTR [ecx+eax] 0002f 74 16 je .B7.5 .B7.4: 00031 f7 de neg esi 00033 83 c2 02 add edx, 2 00036 03 f3 add esi, ebx 00038 83 c0 08 add eax, 8 0003b 8b 0e mov ecx, DWORD PTR [esi] 0003d 8b 76 04 mov esi, DWORD PTR [4+esi] 00040 89 0b mov DWORD PTR [ebx], ecx 00042 89 73 04 mov DWORD PTR [4+ebx], esi 00045 eb 24 jmp .B7.6 .B7.5: 00047 81 e6 ff 00 00 00 and esi, 255 0004d c1 ee 03 shr esi, 3 00050 f3 0f 6f 44 3a 01 movdqu xmm0, XMMWORD PTR [1+edx+edi] 00056 f3 0f 6f 4c 3a 11 movdqu xmm1, XMMWORD PTR [17+edx+edi] 0005c f3 0f 7f 03 movdqu XMMWORD PTR [ebx], xmm0 00060 f3 0f 7f 4b 10 movdqu XMMWORD PTR [16+ebx], xmm1 00065 03 c6 add eax, esi 00067 8d 54 32 01 lea edx, DWORD PTR [1+edx+esi] .B7.6: 0006b 3b 54 24 18 cmp edx, DWORD PTR [24+esp] 0006f 72 ad jb .B7.3 */ // Railgun_Swampshine_BailOut, copyleft 2014-Apr-27, Kaze. // 2014-Apr-27: The nasty SIGNED/UNSIGNED bug in 'Swampshines' which I illustrated several months ago in my fuzzy search article now is fixed here too: /* The bug is this (the variables 'i' and 'PRIMALposition' are uint32_t): Next line assumes -19 >= 0 is true: if ( (i-(PRIMALposition-1)) >= 0) printf ("THE NASTY BUG AGAIN: %d >= 0\n", i-(PRIMALposition-1)); Next line assumes -19 >= 0 is false: if ( (signed int)(i-(PRIMALposition-1)) >= 0) printf ("THE NASTY BUG AGAIN: %d >= 0\n", i-(PRIMALposition-1)); And the actual fix: ... if ( count <= 0 ) { // I have to add out-of-range checks... // i-(PRIMALposition-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4 // FIX from 2014-Apr-27: // Because (count-1) is negative, above fours are reduced to next twos: // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // The line below is BUGGY: //if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) { // The line below is OKAY: if ( ((signed int)(i-(PRIMALposition-1)+(count-1)) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) ) { ... */ // Railgun_Swampshine_BailOut, copyleft 2014-Jan-31, Kaze. // Caution: For better speed the case 'if (cbPattern==1)' was removed, so Pattern must be longer than 1 char. #define NeedleThreshold2vs4swampLITE 9+10 // Should be bigger than 9. BMH2 works up to this value (inclusive), if bigger then BMH4 takes over. char * Railgun_Swampshine_BailOut (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern) { char * pbTargetMax = pbTarget + cbTarget; register uint32_t ulHashPattern; signed long count; unsigned char bm_Horspool_Order2[256*256]; // Bitwise soon... uint32_t i, Gulliver; uint32_t PRIMALposition, PRIMALpositionCANDIDATE; uint32_t PRIMALlength, PRIMALlengthCANDIDATE; uint32_t j, FoundAtPosition; if (cbPattern > cbTarget) return(NULL); if ( cbPattern<4 ) { // SSE2 i.e. 128bit Assembly rules here: // ... pbTarget = pbTarget+cbPattern; ulHashPattern = ( (*(char *)(pbPattern))<<8 ) + *(pbPattern+(cbPattern-1)); if ( cbPattern==3 ) { for ( ;; ) { if ( ulHashPattern == ( (*(char *)(pbTarget-3))<<8 ) + *(pbTarget-1) ) { if ( *(char *)(pbPattern+1) == *(char *)(pbTarget-2) ) return((pbTarget-3)); } if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) { pbTarget++; if ( (char)(ulHashPattern>>8) != *(pbTarget-2) ) pbTarget++; } pbTarget++; if (pbTarget > pbTargetMax) return(NULL); } } else { } for ( ;; ) { if ( ulHashPattern == ( (*(char *)(pbTarget-2))<<8 ) + *(pbTarget-1) ) return((pbTarget-2)); if ( (char)(ulHashPattern>>8) != *(pbTarget-1) ) pbTarget++; pbTarget++; if (pbTarget > pbTargetMax) return(NULL); } } else { //if ( cbPattern<4 ) if ( cbPattern<=NeedleThreshold2vs4swampLITE ) { // BMH order 2, needle should be >=4: ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]=0;} for (i=0; i < cbPattern-1; i++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+i)]=1; i=0; while (i <= cbTarget-cbPattern) { Gulliver = 1; // 'Gulliver' is the skip if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) { if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2; else { if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below: count = cbPattern-4+1; while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) ) count = count-4; if ( count <= 0 ) return(pbTarget+i); } } } else Gulliver = cbPattern-(2-1); i = i + Gulliver; //GlobalI++; // Comment it, it is only for stats. } return(NULL); } else { // if ( cbPattern<=NeedleThreshold2vs4swampLITE ) // Swampwalker_BAILOUT heuristic order 4 (Needle should be bigger than 4) [ // Needle: 1234567890qwertyuiopasdfghjklzxcv PRIMALposition=01 PRIMALlength=33 '1234567890qwertyuiopasdfghjklzxcv' // Needle: vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv PRIMALposition=29 PRIMALlength=04 'vvvv' // Needle: vvvvvvvvvvBOOMSHAKALAKAvvvvvvvvvv PRIMALposition=08 PRIMALlength=20 'vvvBOOMSHAKALAKAvvvv' // Needle: Trollland PRIMALposition=01 PRIMALlength=09 'Trollland' // Needle: Swampwalker PRIMALposition=01 PRIMALlength=11 'Swampwalker' // Needle: licenselessness PRIMALposition=01 PRIMALlength=15 'licenselessness' // Needle: alfalfa PRIMALposition=02 PRIMALlength=06 'lfalfa' // Needle: Sandokan PRIMALposition=01 PRIMALlength=08 'Sandokan' // Needle: shazamish PRIMALposition=01 PRIMALlength=09 'shazamish' // Needle: Simplicius Simplicissimus PRIMALposition=06 PRIMALlength=20 'icius Simplicissimus' // Needle: domilliaquadringenquattuorquinquagintillion PRIMALposition=01 PRIMALlength=32 'domilliaquadringenquattuorquinqu' // Needle: boom-boom PRIMALposition=02 PRIMALlength=08 'oom-boom' // Needle: vvvvv PRIMALposition=01 PRIMALlength=04 'vvvv' // Needle: 12345 PRIMALposition=01 PRIMALlength=05 '12345' // Needle: likey-likey PRIMALposition=03 PRIMALlength=09 'key-likey' // Needle: BOOOOOM PRIMALposition=03 PRIMALlength=05 'OOOOM' // Needle: aaaaaBOOOOOM PRIMALposition=02 PRIMALlength=09 'aaaaBOOOO' // Needle: BOOOOOMaaaaa PRIMALposition=03 PRIMALlength=09 'OOOOMaaaa' PRIMALlength=0; for (i=0+(1); i < cbPattern-((4)-1)+(1)-(1); i++) { // -(1) because the last BB order 4 has no counterpart(s) FoundAtPosition = cbPattern - ((4)-1) + 1; PRIMALpositionCANDIDATE=i; while ( PRIMALpositionCANDIDATE <= (FoundAtPosition-1) ) { j = PRIMALpositionCANDIDATE + 1; while ( j <= (FoundAtPosition-1) ) { if ( *(uint32_t *)(pbPattern+PRIMALpositionCANDIDATE-(1)) == *(uint32_t *)(pbPattern+j-(1)) ) FoundAtPosition = j; j++; } PRIMALpositionCANDIDATE++; } PRIMALlengthCANDIDATE = (FoundAtPosition-1)-i+1 +((4)-1); if (PRIMALlengthCANDIDATE >= PRIMALlength) {PRIMALposition=i; PRIMALlength = PRIMALlengthCANDIDATE;} if (cbPattern-i+1 <= PRIMALlength) break; if (PRIMALlength > 128) break; // Bail Out for 129[+] } // Swampwalker_BAILOUT heuristic order 4 (Needle should be bigger than 4) ] // Here we have 4 or bigger NewNeedle, apply order 2 for pbPattern[i+(PRIMALposition-1)] with length 'PRIMALlength' and compare the pbPattern[i] with length 'cbPattern': PRIMALlengthCANDIDATE = cbPattern; cbPattern = PRIMALlength; pbPattern = pbPattern + (PRIMALposition-1); // Revision 2 commented section [ /* if (cbPattern-1 <= 255) { // BMH Order 2 [ ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]= cbPattern-1;} // cbPattern-(Order-1) for Horspool; 'memset' if not optimized for (i=0; i < cbPattern-1; i++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+i)]=i; // Rightmost appearance/position is needed i=0; while (i <= cbTarget-cbPattern) { Gulliver = bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]]; if ( Gulliver != cbPattern-1 ) { // CASE #2: if equal means the pair (char order 2) is not found i.e. Gulliver remains intact, skip the whole pattern and fall back (Order-1) chars i.e. one char for Order 2 if ( Gulliver == cbPattern-2 ) { // CASE #1: means the pair (char order 2) is found if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) { count = cbPattern-4+1; while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) ) count = count-4; // If we miss to hit then no need to compare the original: Needle if ( count <= 0 ) { // I have to add out-of-range checks... // i-(PRIMALposition-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4 // FIX from 2014-Apr-27: // Because (count-1) is negative, above fours are reduced to next twos: // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // The line below is BUGGY: //if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) { // The line below is OKAY: if ( ((signed int)(i-(PRIMALposition-1)+(count-1)) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) ) { if ( *(uint32_t *)&pbTarget[i-(PRIMALposition-1)] == *(uint32_t *)(pbPattern-(PRIMALposition-1))) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below: count = PRIMALlengthCANDIDATE-4+1; while ( count > 0 && *(uint32_t *)(pbPattern-(PRIMALposition-1)+count-1) == *(uint32_t *)(&pbTarget[i-(PRIMALposition-1)]+(count-1)) ) count = count-4; if ( count <= 0 ) return(pbTarget+i-(PRIMALposition-1)); } } } } Gulliver = 1; } else Gulliver = cbPattern - Gulliver - 2; // CASE #3: the pair is found and not as suffix i.e. rightmost position } i = i + Gulliver; //GlobalI++; // Comment it, it is only for stats. } return(NULL); // BMH Order 2 ] } else { // BMH order 2, needle should be >=4: ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]=0;} for (i=0; i < cbPattern-1; i++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+i)]=1; i=0; while (i <= cbTarget-cbPattern) { Gulliver = 1; // 'Gulliver' is the skip if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) { if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2; else { if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below: count = cbPattern-4+1; while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) ) count = count-4; // If we miss to hit then no need to compare the original: Needle if ( count <= 0 ) { // I have to add out-of-range checks... // i-(PRIMALposition-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4 // FIX from 2014-Apr-27: // Because (count-1) is negative, above fours are reduced to next twos: // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // The line below is BUGGY: //if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) { // The line below is OKAY: if ( ((signed int)(i-(PRIMALposition-1)+(count-1)) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) ) { if ( *(uint32_t *)&pbTarget[i-(PRIMALposition-1)] == *(uint32_t *)(pbPattern-(PRIMALposition-1))) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below: count = PRIMALlengthCANDIDATE-4+1; while ( count > 0 && *(uint32_t *)(pbPattern-(PRIMALposition-1)+count-1) == *(uint32_t *)(&pbTarget[i-(PRIMALposition-1)]+(count-1)) ) count = count-4; if ( count <= 0 ) return(pbTarget+i-(PRIMALposition-1)); } } } } } } else Gulliver = cbPattern-(2-1); i = i + Gulliver; //GlobalI++; // Comment it, it is only for stats. } return(NULL); } */ // Revision 2 commented section ] if ( cbPattern<=NeedleThreshold2vs4swampLITE ) { // BMH order 2, needle should be >=4: ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]=0;} for (i=0; i < cbPattern-1; i++) bm_Horspool_Order2[*(unsigned short *)(pbPattern+i)]=1; i=0; while (i <= cbTarget-cbPattern) { Gulliver = 1; // 'Gulliver' is the skip if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1]] != 0 ) { if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+cbPattern-1-1-2]] == 0 ) Gulliver = cbPattern-(2-1)-2; else { if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below: count = cbPattern-4+1; while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) ) count = count-4; // If we miss to hit then no need to compare the original: Needle if ( count <= 0 ) { // I have to add out-of-range checks... // i-(PRIMALposition-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4 // FIX from 2014-Apr-27: // Because (count-1) is negative, above fours are reduced to next twos: // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // The line below is BUGGY: //if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) { // The line below is OKAY: if ( ((signed int)(i-(PRIMALposition-1)+(count-1)) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) ) { if ( *(uint32_t *)&pbTarget[i-(PRIMALposition-1)] == *(uint32_t *)(pbPattern-(PRIMALposition-1))) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below: count = PRIMALlengthCANDIDATE-4+1; while ( count > 0 && *(uint32_t *)(pbPattern-(PRIMALposition-1)+count-1) == *(uint32_t *)(&pbTarget[i-(PRIMALposition-1)]+(count-1)) ) count = count-4; if ( count <= 0 ) return(pbTarget+i-(PRIMALposition-1)); } } } } } } else Gulliver = cbPattern-(2-1); i = i + Gulliver; //GlobalI++; // Comment it, it is only for stats. } return(NULL); } else { // if ( cbPattern<=NeedleThreshold2vs4swampLITE ) // BMH pseudo-order 4, needle should be >=8+2: ulHashPattern = *(uint32_t *)(pbPattern); // First four bytes for (i=0; i < 256*256; i++) {bm_Horspool_Order2[i]=0;} // In line below we "hash" 4bytes to 2bytes i.e. 16bit table, how to compute TOTAL number of BBs, 'cbPattern - Order + 1' is the number of BBs for text 'cbPattern' bytes long, for example, for cbPattern=11 'fastest fox' and Order=4 we have BBs = 11-4+1=8: //"fast" //"aste" //"stes" //"test" //"est " //"st f" //"t fo" //" fox" //for (i=0; i < cbPattern-4+1; i++) bm_Horspool_Order2[( *(unsigned short *)(pbPattern+i+0) + *(unsigned short *)(pbPattern+i+2) ) & ( (1<<16)-1 )]=1; //for (i=0; i < cbPattern-4+1; i++) bm_Horspool_Order2[( (*(uint32_t *)(pbPattern+i+0)>>16)+(*(uint32_t *)(pbPattern+i+0)&0xFFFF) ) & ( (1<<16)-1 )]=1; // Above line is replaced by next one with better hashing: for (i=0; i < cbPattern-4+1; i++) bm_Horspool_Order2[( (*(uint32_t *)(pbPattern+i+0)>>(16-1))+(*(uint32_t *)(pbPattern+i+0)&0xFFFF) ) & ( (1<<16)-1 )]=1; i=0; while (i <= cbTarget-cbPattern) { Gulliver = 1; //if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]&0xFFFF) ) & ( (1<<16)-1 )] != 0 ) { // DWORD #1 // Above line is replaced by next one with better hashing: if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2]&0xFFFF) ) & ( (1<<16)-1 )] != 0 ) { // DWORD #1 //if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] == 0 ) Gulliver = cbPattern-(2-1)-2-4; else { // Above line is replaced in order to strengthen the skip by checking the middle DWORD,if the two DWORDs are 'ab' and 'cd' i.e. [2x][2a][2b][2c][2d] then the middle DWORD is 'bc'. // The respective offsets (backwards) are: -10/-8/-6/-4 for 'xa'/'ab'/'bc'/'cd'. //if ( ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) ) & ( (1<<16)-1 )] ) < 3 ) Gulliver = cbPattern-(2-1)-2-4-2; else { // Above line is replaced by next one with better hashing: // When using (16-1) right shifting instead of 16 we will have two different pairs (if they are equal), the highest bit being lost do the job especialy for ASCII texts with no symbols in range 128-255. // Example for genomesque pair TT+TT being shifted by (16-1): // T = 01010100 // TT = 01010100 01010100 // TTTT = 01010100 01010100 01010100 01010100 // TTTT>>16 = 00000000 00000000 01010100 01010100 // TTTT>>(16-1) = 00000000 00000000 10101000 10101000 <--- Due to the left shift by 1, the 8th bits of 1st and 2nd bytes are populated - usually they are 0 for English texts & 'ACGT' data. //if ( ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) ) & ( (1<<16)-1 )] ) < 3 ) Gulliver = cbPattern-(2-1)-2-4-2; else { // 'Maximus' uses branched 'if', again. if ( \ ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6 +1]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6 +1]&0xFFFF) ) & ( (1<<16)-1 )] ) == 0 \ || ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4 +1]>>(16-1))+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4 +1]&0xFFFF) ) & ( (1<<16)-1 )] ) == 0 \ ) Gulliver = cbPattern-(2-1)-2-4-2 +1; else { // Above line is not optimized (several a SHR are used), we have 5 non-overlapping WORDs, or 3 overlapping WORDs, within 4 overlapping DWORDs so: // [2x][2a][2b][2c][2d] // DWORD #4 // [2a] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]>>16) = !SHR to be avoided! <-- // [2x] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) = | // DWORD #3 | // [2b] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16) = !SHR to be avoided! |<-- // [2a] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) = ------------------------ | // DWORD #2 | // [2c] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]>>16) = !SHR to be avoided! |<-- // [2b] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) = --------------------------- | // DWORD #1 | // [2d] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]>>16) = | // [2c] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]&0xFFFF) = ------------------------------ // // So in order to remove 3 SHR instructions the equal extractions are: // DWORD #4 // [2a] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) = !SHR to be avoided! <-- // [2x] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) = | // DWORD #3 | // [2b] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) = !SHR to be avoided! |<-- // [2a] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) = ------------------------ | // DWORD #2 | // [2c] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]&0xFFFF) = !SHR to be avoided! |<-- // [2b] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) = --------------------------- | // DWORD #1 | // [2d] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]>>16) = | // [2c] (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]&0xFFFF) = ------------------------------ //if ( ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-6]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-0]&0xFFFF)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-2]&0xFFFF) ) & ( (1<<16)-1 )] ) < 3 ) Gulliver = cbPattern-(2-1)-2-6; else { // Since the above Decumanus mumbo-jumbo (3 overlapping lookups vs 2 non-overlapping lookups) is not fast enough we go DuoDecumanus or 3x4: // [2y][2x][2a][2b][2c][2d] // DWORD #3 // DWORD #2 // DWORD #1 //if ( ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-4]&0xFFFF) ) & ( (1<<16)-1 )] ) + ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-8]>>16)+(*(uint32_t *)&pbTarget[i+cbPattern-1-1-2-8]&0xFFFF) ) & ( (1<<16)-1 )] ) < 2 ) Gulliver = cbPattern-(2-1)-2-8; else { if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) { // Order 4 [ // Let's try something "outrageous" like comparing with[out] overlap BBs 4bytes long instead of 1 byte back-to-back: // Inhere we are using order 4, 'cbPattern - Order + 1' is the number of BBs for text 'cbPattern' bytes long, for example, for cbPattern=11 'fastest fox' and Order=4 we have BBs = 11-4+1=8: //0:"fast" if the comparison failed here, 'count' is 1; 'Gulliver' is cbPattern-(4-1)-7 //1:"aste" if the comparison failed here, 'count' is 2; 'Gulliver' is cbPattern-(4-1)-6 //2:"stes" if the comparison failed here, 'count' is 3; 'Gulliver' is cbPattern-(4-1)-5 //3:"test" if the comparison failed here, 'count' is 4; 'Gulliver' is cbPattern-(4-1)-4 //4:"est " if the comparison failed here, 'count' is 5; 'Gulliver' is cbPattern-(4-1)-3 //5:"st f" if the comparison failed here, 'count' is 6; 'Gulliver' is cbPattern-(4-1)-2 //6:"t fo" if the comparison failed here, 'count' is 7; 'Gulliver' is cbPattern-(4-1)-1 //7:" fox" if the comparison failed here, 'count' is 8; 'Gulliver' is cbPattern-(4-1) count = cbPattern-4+1; // Below comparison is UNIdirectional: while ( count > 0 && *(uint32_t *)(pbPattern+count-1) == *(uint32_t *)(&pbTarget[i]+(count-1)) ) count = count-4; // count = cbPattern-4+1 = 23-4+1 = 20 // boomshakalakaZZZZZZ[ZZZZ] 20 // boomshakalakaZZ[ZZZZ]ZZZZ 20-4 // boomshakala[kaZZ]ZZZZZZZZ 20-8 = 12 // boomsha[kala]kaZZZZZZZZZZ 20-12 = 8 // boo[msha]kalakaZZZZZZZZZZ 20-16 = 4 // If we miss to hit then no need to compare the original: Needle if ( count <= 0 ) { // I have to add out-of-range checks... // i-(PRIMALposition-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4 // FIX from 2014-Apr-27: // Because (count-1) is negative, above fours are reduced to next twos: // i-(PRIMALposition-1)+(count-1) >= 0 // &pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4 // The line below is BUGGY: //if ( (i-(PRIMALposition-1) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) && (&pbTarget[i-(PRIMALposition-1)+(count-1)] <= pbTargetMax - 4) ) { // The line below is OKAY: if ( ((signed int)(i-(PRIMALposition-1)+(count-1)) >= 0) && (&pbTarget[i-(PRIMALposition-1)] <= pbTargetMax - 4) ) { if ( *(uint32_t *)&pbTarget[i-(PRIMALposition-1)] == *(uint32_t *)(pbPattern-(PRIMALposition-1))) { // This fast check ensures not missing a match (for remainder) when going under 0 in loop below: count = PRIMALlengthCANDIDATE-4+1; while ( count > 0 && *(uint32_t *)(pbPattern-(PRIMALposition-1)+count-1) == *(uint32_t *)(&pbTarget[i-(PRIMALposition-1)]+(count-1)) ) count = count-4; if ( count <= 0 ) return(pbTarget+i-(PRIMALposition-1)); } } } // In order to avoid only-left or only-right WCS the memcmp should be done as left-to-right and right-to-left AT THE SAME TIME. // Below comparison is BIdirectional. It pays off when needle is 8+++ long: // for (count = cbPattern-4+1; count > 0; count = count-4) { // if ( *(uint32_t *)(pbPattern+count-1) != *(uint32_t *)(&pbTarget[i]+(count-1)) ) {break;}; // if ( *(uint32_t *)(pbPattern+(cbPattern-4+1)-count) != *(uint32_t *)(&pbTarget[i]+(cbPattern-4+1)-count) ) {count = (cbPattern-4+1)-count +(1); break;} // +(1) because two lookups are implemented as one, also no danger of 'count' being 0 because of the fast check outwith the 'while': if ( *(uint32_t *)&pbTarget[i] == ulHashPattern) // } // if ( count <= 0 ) return(pbTarget+i); // Checking the order 2 pairs in mismatched DWORD, all the 3: //if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1]] == 0 ) Gulliver = count; // 1 or bigger, as it should //if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1+1]] == 0 ) Gulliver = count+1; // 1 or bigger, as it should //if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1+1+1]] == 0 ) Gulliver = count+1+1; // 1 or bigger, as it should // if ( bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1]] + bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1+1]] + bm_Horspool_Order2[*(unsigned short *)&pbTarget[i+count-1+1+1]] < 3 ) Gulliver = count; // 1 or bigger, as it should, THE MIN(count,count+1,count+1+1) // Above compound 'if' guarantees not that Gulliver > 1, an example: // Needle: fastest tax // Window: ...fastast tax... // After matching ' tax' vs ' tax' and 'fast' vs 'fast' the mismathced DWORD is 'test' vs 'tast': // 'tast' when factorized down to order 2 yields: 'ta','as','st' - all the three when summed give 1+1+1=3 i.e. Gulliver remains 1. // Roughly speaking, this attempt maybe has its place in worst-case scenarios but not in English text and even not in ACGT data, that's why I commented it in original 'Shockeroo'. //if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+count-1]>>16)+(*(uint32_t *)&pbTarget[i+count-1]&0xFFFF) ) & ( (1<<16)-1 )] == 0 ) Gulliver = count; // 1 or bigger, as it should // Above line is replaced by next one with better hashing: // if ( bm_Horspool_Order2[( (*(uint32_t *)&pbTarget[i+count-1]>>(16-1))+(*(uint32_t *)&pbTarget[i+count-1]&0xFFFF) ) & ( (1<<16)-1 )] == 0 ) Gulliver = count; // 1 or bigger, as it should // Order 4 ] } } } else Gulliver = cbPattern-(2-1)-2; // -2 because we check the 4 rightmost bytes not 2. i = i + Gulliver; //GlobalI++; // Comment it, it is only for stats. } return(NULL); } // if ( cbPattern<=NeedleThreshold2vs4swampLITE ) } // if ( cbPattern<=NeedleThreshold2vs4swampLITE ) } //if ( cbPattern<4 ) } // Fixed version from 2012-Feb-27. // Caution: For better speed the case 'if (cbPattern==1)' was removed, so Pattern must be longer than 1 char. char * Railgun_Doublet (char * pbTarget, char * pbPattern, uint32_t cbTarget, uint32_t cbPattern) { char * pbTargetMax = pbTarget + cbTarget; register uint32_t ulHashPattern; uint32_t ulHashTarget, count, countSTATIC; if (cbPattern > cbTarget) return(NULL); countSTATIC = cbPattern-2; pbTarget = pbTarget+cbPattern; ulHashPattern = (*(uint16_t *)(pbPattern)); for ( ;; ) { if ( ulHashPattern == (*(uint16_t *)(pbTarget-cbPattern)) ) { count = countSTATIC; while ( count && *(char *)(pbPattern+2+(countSTATIC-count)) == *(char *)(pbTarget-cbPattern+2+(countSTATIC-count)) ) { count--; } if ( count == 0 ) return((pbTarget-cbPattern)); } pbTarget++; if (pbTarget > pbTargetMax) return(NULL); } } // Last change: 2015-Jul-13 // If you want to help me to improve it, email me at: sanmayce@sanmayce.com // Enfun! uint64_t Decompress001 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress002 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress003 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress004 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress005 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress006 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress007 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress008 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: //debug [ /* if ((uint64_t)retLOCAL>0x100000000) printf("(const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ): %p\n",(const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) )); if ((uint64_t)retLOCAL>0x100000000) printf("(const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)) ): %p\n",(const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)) )); if ((uint64_t)retLOCAL>0x100000000) printf("(const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1))&FlagMASKnegated ): %p\n",(const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1))&FlagMASKnegated )); if ((uint64_t)retLOCAL>0x100000000) printf("((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated): %s\n", _ui64toaKAZEcomma((uint64_t)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), llTOaDigits, 10)); if ((uint64_t)retLOCAL>0x100000000) printf("(uint64_t)(-1): %s\n", _ui64toaKAZEcomma((uint64_t)(-1), llTOaDigits, 10)); if ((uint64_t)retLOCAL>0x100000000) printf("(uint64_t)FlagMASKnegated: %s\n", _ui64toaKAZEcomma((uint64_t)FlagMASKnegated, llTOaDigits, 10)); if ((uint64_t)retLOCAL>0x100000000) printf("SOURCE: %s\n", _ui64toaKAZEcomma((uint64_t)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), llTOaDigits, 10)); if ((uint64_t)retLOCAL>0x100000000) printf("retLOCAL: %p\n",retLOCAL); if ((uint64_t)retLOCAL>0x100000000) printf("retLOCAL: %s\n", _ui64toaKAZEcomma((uint64_t)retLOCAL, llTOaDigits, 10)); if ((uint64_t)retLOCAL>0x100000000) printf("FlagMASK, FlagMASKnegated: %d, %d\n", FlagMASK,FlagMASKnegated); if ((uint64_t)retLOCAL>0x100000000) printf("\n"); */ /* (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ): 00000000F21664A7 (const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)) ): 00000000F21664A6 (const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1))&FlagMASKnegated ): 00000000F21664A6 ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated): 4,061,553,830 (uint64_t)(-1): 18,446,744,073,709,551,615 (uint64_t)FlagMASKnegated: 4,294,967,295 SOURCE: 4,061,553,831 retLOCAL: 000000010000017B retLOCAL: 4,294,967,675 FlagMASK, FlagMASKnegated: 0, -1 (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ): 00000000FFFC1763 (const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)) ): 00000000FFFC1762 (const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1))&FlagMASKnegated ): 00000000FFFC1762 ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated): 4,294,711,138 (uint64_t)(-1): 18,446,744,073,709,551,615 (uint64_t)FlagMASKnegated: 4,294,967,295 SOURCE: 4,294,711,139 retLOCAL: 0000000100000183 retLOCAL: 4,294,967,683 FlagMASK, FlagMASKnegated: 0, -1 (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ): 00000000000000CA (const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)) ): 00000001000000C9 (const char *)( ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1))&FlagMASKnegated ): 00000000000000C9 ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated): 201 (uint64_t)(-1): 18,446,744,073,709,551,615 (uint64_t)FlagMASKnegated: 4,294,967,295 SOURCE: 202 retLOCAL: 000000010000018B retLOCAL: 4,294,967,691 FlagMASK, FlagMASKnegated: 0, -1 Exit code: -1073741819 */ //debug ] memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); //ORIGINAL #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress009 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress010 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress011 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress012 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress013 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress014 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress015 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress016 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress017 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress018 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress019 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress020 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress021 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress022 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress023 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress024 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress025 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress026 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress027 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress028 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress029 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress030 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress031 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } uint64_t Decompress032 (unsigned char* ret, unsigned char* src, uint64_t srcSize) { unsigned char* retLOCAL = ret; unsigned char* srcLOCAL = src; unsigned char* srcEndLOCAL = src+srcSize; unsigned int DWORDtrio; //unsigned int Flag; // This line made my hair white! For some reason it should be 64bit, otherwise it makes 'FlagMASKnegated' 32bit during 'FlagMASKnegated= Flag - 1; // -1|0' uint64_t Flag; uint64_t FlagMASK; //= 0xFFFFFFFFFFFFFFFF; uint64_t FlagMASKnegated; //=0x0000000000000000; //int loopcounter=0; while (srcLOCAL < srcEndLOCAL) { //loopcounter++; DWORDtrio = *(unsigned int*)srcLOCAL; //#ifndef _N_GP //#ifdef _N_prefetch_4096 // _mm_prefetch((char*)(srcLOCAL + 64*64), _MM_HINT_T0); //#endif //#endif // |1stLSB |2ndLSB |3rdLSB | // ------------------------------- // |OO|LL|xxxx|xxxxxxxx|xxxxxx|xx| // ------------------------------- // [1bit 16bit] 24bit] // OOLL = 0011 means Literal // OO = 00b MatchOffset, 0xFFFFFFFF>>(3-OO), 1 bytes long i.e. Sliding Window is 1*8-LL-OO=(1+OO)*8-4=04 or 16B // OO = 01b MatchOffset, 0xFFFFFFFF>>(3-OO), 2 bytes long i.e. Sliding Window is 2*8-LL-OO=(1+OO)*8-4=12 or 4KB // OO = 10b MatchOffset, 0xFFFFFFFF>>(3-OO), 3 bytes long i.e. Sliding Window is 3*8-LL-OO=(1+OO)*8-4=20 or 1MB // OO = 11b MatchOffset, 0xFFFFFFFF>>(3-OO), 4 bytes long i.e. Sliding Window is 4*8-LL-OO=(1+OO)*8-4=28 or 256MB // LL = 00b means 04/08/12 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 01b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 10b means 04/08/12/16 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // LL = 11b means 08/16/24/32 MatchLength, ((1+LL)<<2) << (1+OO)>>2) // (1<<2<<0):1 = 4:1 priority #08 #01 12:1 = 12 // (2<<2<<0):1 = 8:1 priority #02 #02 8:1 = 8 // (3<<2<<0):1 = 12:1 priority #01 #03 16:2 = 8 // (4<<2<<0):1 = 16:1 (not used in 'Hoshimi') #04 32:4 = 8 // (1<<2<<0):2 = 4:2 priority #13 #05 12:2 = 6 // (2<<2<<0):2 = 8:2 priority #09 #06 24:4 = 6 // (3<<2<<0):2 = 12:2 priority #05 #07 16:3 = 5.3 // (4<<2<<0):2 = 16:2 priority #03 #08 4:1 = 4 // (1<<2<<0):3 = 4:3 priority #15 #09 8:2 = 4 // (2<<2<<0):3 = 8:3 priority #12 #10 12:3 = 4 // (3<<2<<0):3 = 12:3 priority #10 #11 16:4 = 4 // (4<<2<<0):3 = 16:3 priority #07 #12 8:3 = 2.6 // (1<<2<<1):4 = 8:4 priority #14 (not used in 'Hoshimi*') #13 4:2 = 2 // (2<<2<<1):4 = 16:4 priority #11 #14 8:4 = 2 // (3<<2<<1):4 = 24:4 priority #06 #15 4:3 = 1.6 // (4<<2<<1):4 = 32:4 priority #04 // In 'Hoshimi' two bit combinations were unexploited, in 'Hoshimikou' one bit combination was unexploited, in 'Lexx' none is left. /* // Branchfull [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); if ( (DWORDtrio & 0x0F) == 0x0C ) { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(srcLOCAL+1) ), 16*2); // Hard lesson: XMM and YMM are not to be used together. #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(srcLOCAL+1) ), retLOCAL ); #endif retLOCAL+= (DWORDtrio>>4); srcLOCAL+= (DWORDtrio>>4)+1; } else { #ifdef _N_GP memcpy(retLOCAL, (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), 16*2); #endif #ifdef _N_YMM SlowCopy256bit( (const char *)( (uint64_t)(retLOCAL-(DWORDtrio>>4)) ), retLOCAL ); #endif srcLOCAL+= 1+(DWORDtrio&0x03); // 4|3|2|1 retLOCAL+= ((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2); // 4/8/12/16/24/32 } // Branchfull ] */ // Branchless [ DWORDtrio = DWORDtrio&( 0xFFFFFFFF >> ((3-(DWORDtrio & 0x03))<<3) ); Flag=!((DWORDtrio & 0x0F)-0x0C); // In here Flag=0|1 FlagMASKnegated= Flag - 1; // -1|0 FlagMASK= ~FlagMASKnegated; #ifdef _N_YMM // SlowCopy256bit( (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), retLOCAL); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: SlowCopy256bit( (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), retLOCAL); #endif #ifdef _N_GP // memcpy(retLOCAL, (const char *)( ((uint64_t)(srcLOCAL+1)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4))&FlagMASKnegated) ), 16*2); // Another (incompatible with Branchfull variant, though) way to avoid 'LEA' is to put the '+1' outside the FlagMASK but then the encoder has to count literals from zero in order to compensate '-((DWORDtrio>>4)-1) = -(DWORDtrio>>4)+1' within FlagMASKnegated: memcpy(retLOCAL, (const char *)( 1+ ((uint64_t)(srcLOCAL)&FlagMASK) + ((uint64_t)(retLOCAL-(DWORDtrio>>4)-1)&FlagMASKnegated) ), 16*2); #endif retLOCAL+= ((uint64_t)((DWORDtrio>>4))&FlagMASK) + ((uint64_t)(((1+((DWORDtrio>>2)&0x03))<<2) << ((1+(DWORDtrio&0x03))>>2))&FlagMASKnegated) ; srcLOCAL+= ((uint64_t)((DWORDtrio>>4)+1)&FlagMASK) + ((uint64_t)(1+(DWORDtrio&0x03))&FlagMASKnegated) ; // Branchless ] } //printf("\nloopcounter=%d\n",loopcounter); // loopcounter=29763921 for 'Autobiography_411-ebooks_Collection.tar.Lexx.Nakamichi' return (uint64_t)(retLOCAL - ret); } // Last change: 2015-Aug-27