// Copyleft Sanmayce, 2012 Dec 27.
// FLC_Fixed-Length-Chunker.c, revision 1.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// Build-time platform switch: the code below tests only _WIN32_ENVIRONMENT_
// (with a plain #else for every other platform); _POSIX_ENVIRONMENT_ is never
// tested in this file. Comment out the first line to build the non-Windows path.
#define _WIN32_ENVIRONMENT_
//#define _POSIX_ENVIRONMENT_

// Capacity (in bytes, before the +1+1 reserved for CR/LF) of the line buffer used by main().
#define MaxLineLength 960

// Fallback definition of NULL for toolchains whose headers did not provide one.
#ifndef NULL
#ifdef __cplusplus
#define NULL 0
#else
#define NULL ((void*)0)
#endif
#endif

// Boolean constants in both spellings; each is defined only if not already present.
#ifndef FALSE
#define FALSE 0
#endif
#ifndef TRUE
#define TRUE 1
#endif

#ifndef false
#define false 0
#endif
#ifndef true
#define true 1
#endif

#if defined(_WIN32_ENVIRONMENT_)
#include <io.h> // needed for Windows' 'lseeki64' and 'telli64'
//Above line must be commented in order to compile with Intel C compiler: an error "can't find io.h" occurs.
#else
// Non-Windows builds need no extra header here; main() uses fseeko/ftello instead.
#endif /* defined(_WIN32_ENVIRONMENT_)  */

// Byte and byte-string aliases.
typedef unsigned char char_t;
typedef char_t *string;
// The seven single-character wildcards plus the terminating NUL.
// Referenced only from commented-out code in Blunderbuss().
char TAGfree[8] = "*@#^$|%";
// Truth value returned by the wildcard matcher (holds the true/false macros).
typedef unsigned char boolean;
// Output stream shared by main() and Blunderbuss(); opened in main().
FILE *fp_outLOG;

// ASCII-only case conversion. NOTE: the argument is evaluated more than once,
// so it must not have side effects.
#define KAZE_tolower(c) ( (((c) >= 'A') && ((c) <= 'Z')) ? ((c) - 'A' + 'a') : (c) )
#define KAZE_toupper(c) ( (((c) >= 'a') && ((c) <= 'z')) ? ((c) - 'a' + 'A') : (c) )

// Lengths of the current mask and name. They are set by CompareWildCardWithName()
// and read by EnhancedMaskTest_OrEmpty_AndNotEmpty() on every recursion step
// instead of recomputing the lengths each time.
long maskGLOBALlen; // for speed up ((12415s - 755s)/755)*100% = 1544%
long nameGLOBALlen; 

// Statistics: number of CompareWildCardWithName() calls, number of successful
// matches, and number of lines visited by Blunderbuss().
long VIVA_IgorPavlov_invocations_global_counter = 0;
long WildGLOBALhits = 0, TotalLinesENC = 0;


/*
 * memchrKAZE - find the first byte in a buffer that equals a given byte.
 *
 *   buf  start of the region to examine
 *   chr  pointer to the byte value being searched for (only *chr is read)
 *   cnt  number of bytes of buf to examine
 *
 * Returns a pointer to the first matching byte, or NULL when none of the
 * first cnt bytes matches (this includes cnt == 0).
 */
void * memchrKAZE (
        const void * buf,
        const void * chr,
        unsigned long cnt
        )
{
        const unsigned char *cursor = (const unsigned char *)buf;
        unsigned long remaining;

        for ( remaining = cnt; remaining != 0; --remaining, ++cursor ) {
                if ( *cursor == *(const unsigned char *)chr )
                        return (void *)cursor;
        }

        return NULL;
}
//Exit:
//       returns pointer to the first occurrence in buf of the byte that chr points to
//       returns NULL if that byte is not found in the first cnt bytes


/*
 * KAZE_strlen - length of a NUL-terminated string, terminator not counted.
 * The pointer difference is narrowed to int before being returned as long.
 */
long KAZE_strlen (
        const char * str
        )
{
        const char *scan;

        for ( scan = str; *scan != '\0'; ++scan )
                ;

        return( (int)(scan - str) );
}
//_KAZE_strlen PROC NEAR
//; Line 225: const char *eos = str;
//        mov     ecx, DWORD PTR _str$[esp-4]
//        mov     eax, ecx
//$L725:
//; Line 227: while( *eos++ ) ;
//        mov     dl, BYTE PTR [eax]
//        inc     eax
//        test    dl, dl
//        jne     SHORT $L725
//; Line 229: return( (int)(eos - str - 1) );
//        sub     eax, ecx
//        dec     eax
//; Line 230
//        ret     0
//_KAZE_strlen ENDP


/*
 * KAZE_strlenLF - number of bytes preceding the first LF (byte value 10).
 * Only LF ends the scan: a NUL byte is counted like any other byte, so the
 * input must contain an LF. On CRLF text the CR is part of the count.
 */
long KAZE_strlenLF (
        const char * str
        )
{
        const char *scan = str;

        while ( *scan != 10 )
                ++scan;

        return( (int)(scan - str) );
}

// Kaze [

//       wildcard '*' any character(s) or empty,
//       wildcard '@'/'#' any character {or empty}/{and not empty},
//       wildcard '^'/'$' any ALPHA character {or empty}/{and not empty},
//       wildcard '|'/'%' any NON-ALPHA character {or empty}/{and not empty},
//       wildcard '+'/'~' any WORD {or empty}/{and not empty}.

// wildcard '*' any character(s) or empty,
// wildcard '@' any character or empty,
// wildcard '#' any character and not empty,
// wildcard '$' any ALPHA character and not empty,
// wildcard '%' any NON-ALPHA character and not empty.
// Note: Because line endings differ (CRLF on Windows, LF on UNIX),
//       add a '@' wildcard where the CR would appear: for example,
//       to search for '*.pdf' write '*.pdf@'.
// Pattern example: *%%take@%%$$@

/*
 * Recursive, backtracking wildcard match of name[namePos..] against
 * mask[maskPos..]. The total lengths are taken from the globals
 * maskGLOBALlen and nameGLOBALlen, which the caller must set first.
 *
 * Mask characters:
 *   '*'  any run of characters, including none
 *   '@'  any one character, or nothing
 *   '#'  exactly one character
 *   '|'  one non-alphabetic character, or nothing
 *   '%'  exactly one non-alphabetic character
 *   '^'  one alphabetic character, or nothing
 *   '$'  exactly one alphabetic character
 *   any other character matches itself, ignoring ASCII case.
 * "Alphabetic" means 'A'..'Z' or 'a'..'z'.
 *
 * Returns true when the whole remaining mask matches the whole remaining
 * name, false otherwise.
 *
 * The name is never read at or beyond nameGLOBALlen: every branch checks
 * that a character is left before looking at name[namePos].
 */
static boolean EnhancedMaskTest_OrEmpty_AndNotEmpty(const char *mask, int maskPos, 
                                                 const char *name, int namePos)
{
  char maskChar;
  char c;
  int isAlpha;
  //int maskLen = KAZE_strlen(mask) - maskPos;
  //int nameLen = KAZE_strlenLF(name) - namePos;
  // Above 2 lines are modified with GLOBAL variables for speed as follows:
  int maskLen = maskGLOBALlen - maskPos; // for speed up ((12415s - 755s)/755)*100% = 1544%
  int nameLen = nameGLOBALlen - namePos;

  // Mask exhausted: success only if the name is exhausted as well.
  if (maskLen == 0)
  {
    if (nameLen == 0)
      return true;
    return false;
  }

  maskChar = mask[maskPos];

  switch (maskChar)
  {
    case '*':
      // First try matching nothing, then let '*' swallow one more character.
      if (EnhancedMaskTest_OrEmpty_AndNotEmpty(mask, maskPos + 1, name, namePos))
        return true;
      if (nameLen <= 0)
        return false;
      return EnhancedMaskTest_OrEmpty_AndNotEmpty(mask, maskPos, name, namePos + 1);

    case '@': // any character, or empty
    case '|': // non-alpha character, or empty
    case '^': // alpha character, or empty
      // The 'or empty' variants first try to match nothing at all.
      if (EnhancedMaskTest_OrEmpty_AndNotEmpty(mask, maskPos + 1, name, namePos))
        return true;
      break;

    case '#': // any character, not empty
    case '%': // non-alpha character, not empty
    case '$': // alpha character, not empty
      break;

    default:
      // Literal character: compared ignoring ASCII case.
      if (nameLen <= 0)
        return false;
      c = name[namePos];
      if (KAZE_toupper(maskChar) != KAZE_toupper(c))
        return false;
      return EnhancedMaskTest_OrEmpty_AndNotEmpty(mask, maskPos + 1, name, namePos + 1);
  }

  // Single-character wildcard: exactly one character of the name is consumed.
  if (nameLen <= 0)
    return false;

  c = name[namePos];
  isAlpha = ( (KAZE_toupper(c) >= 'A') && (KAZE_toupper(c) <= 'Z') );

  if ( (maskChar == '|' || maskChar == '%') && isAlpha )
    return false;
  if ( (maskChar == '^' || maskChar == '$') && !isAlpha )
    return false;

  return EnhancedMaskTest_OrEmpty_AndNotEmpty(mask, maskPos + 1, name, namePos + 1);
}


/*
 * Entry point of the wildcard matcher.
 *
 * Stores the lengths of mask and name (both measured up to their NUL
 * terminator) in maskGLOBALlen / nameGLOBALlen, runs the recursive matcher
 * from position 0 of both strings and returns its verdict.
 * Side effects: the invocation counter is incremented on every call and
 * WildGLOBALhits on every successful match.
 */
boolean CompareWildCardWithName(const char *mask, const char *name)
{
  boolean matched;

  ++VIVA_IgorPavlov_invocations_global_counter;

  // The recursive matcher reads both lengths from these globals.
  nameGLOBALlen = KAZE_strlen(name);
  maskGLOBALlen = KAZE_strlen(mask);

  matched = EnhancedMaskTest_OrEmpty_AndNotEmpty(mask, 0, name, 0);
  if (matched)
  {
    ++WildGLOBALhits;
  }
  return matched;
}
// Above fragment(modified) is from wildcard.cpp from 7zip package.


/*
 * Blunderbuss - run the wildcard mask x over every LF-terminated line found
 * in y[0] .. y[n-1] and write each matching line (its LF included) to
 * fp_outLOG.
 *
 *   y  text buffer holding the lines
 *   x  NUL-terminated wildcard mask
 *   n  number of bytes of y that may be examined
 *   m  unused
 *
 * Returns the number of matching lines. TotalLinesENC is incremented once
 * for every LF-terminated line visited.
 *
 * Lines longer than 960 bytes are counted but not matched, because the
 * recursive matcher becomes far too slow on long lines.
 * A trailing fragment without an LF is ignored; the scan never looks at or
 * beyond y[n].
 * NOTE(review): as before, the walk stops once the next line would start at
 * or after y[n-1]; confirm against the callers whether the last byte of the
 * buffer is meant to be a terminator.
 * NOTE(review): CompareWildCardWithName() measures the name up to a NUL
 * byte, not up to the LF - confirm that this is what callers expect.
 */
unsigned long Blunderbuss(char *y, char *x, long n, int m)
{
    unsigned long hits = 0;
    long lineStart = 0;   /* index of the first byte of the current line */
    long lineEnd = 0;     /* index of the LF ending the current line     */

    (void)m;

    if ( n <= 0 ) return(0);

    for (;;) {
        /* Find the LF that ends the current line, staying inside y[0..n-1]. */
        while ( lineEnd < n && y[lineEnd] != 10 ) { ++lineEnd; } /* Works both on UNIX(LF) and Windows(CRLF) */
        if ( lineEnd >= n ) break;   /* unterminated tail: nothing more to do */

        TotalLinesENC++;

        /* To avoid nasty big delays (hours sometimes) the length of a
           searched line is limited. */
        if ( lineEnd - lineStart <= 960 ) {
            if ( CompareWildCardWithName( x, &y[lineStart] ) ) {
                hits++;
                fwrite( &y[lineStart], lineEnd - lineStart + 1, 1, fp_outLOG );
            }
        }

        /* The next line starts right after the LF. This index is used
           directly; re-deriving it by scanning backwards went wrong when
           the very first byte of the buffer was an LF. */
        lineStart = ++lineEnd;
        if ( lineEnd >= n - 1 ) break;
    }

    return(hits);
}


/*
 * x64toaKAZE - conversion helper: write val as text in the given radix.
 *
 *   val     value to convert; when is_neg is non-zero it is the
 *           two's-complement bit pattern of a negative number
 *   buf     destination; must have room for the sign, every digit and the
 *           terminating NUL (up to 66 bytes for radix 2)
 *   radix   number base, at least 2; digits above 9 are written as 'a', 'b', ...
 *           A radix below 2 cannot be converted and yields an empty string.
 *   is_neg  non-zero to emit a leading '-' followed by the magnitude
 */
void x64toaKAZE (
        unsigned long long val,
        char *buf,
        unsigned radix,
        int is_neg
        )
{
        char *p;                /* pointer to traverse string */
        char *firstdig;         /* pointer to first digit */
        char temp;              /* temp char */
        unsigned digval;        /* value of digit */

        p = buf;

        if ( radix < 2 )
        {
            /* radix 0 would divide by zero and radix 1 would never finish */
            *p = '\0';
            return;
        }

        if ( is_neg )
        {
            *p++ = '-';         /* negative, so output '-' and negate */
            /* Negate in unsigned arithmetic: this is well defined for every
               bit pattern, including that of the most negative long long. */
            val = 0ULL - val;
        }

        firstdig = p;           /* save pointer to first digit */

        do {
            digval = (unsigned) (val % radix);
            val /= radix;       /* get next digit */

            /* convert to ascii and store */
            if (digval > 9)
                *p++ = (char) (digval - 10 + 'a');  /* a letter */
            else
                *p++ = (char) (digval + '0');       /* a digit */
        } while (val > 0);

        /* We now have the digits of the number in the buffer, but in reverse
           order.  Thus we reverse them now. */

        *p-- = '\0';            /* terminate string; p points to last digit */

        do {
            temp = *p;
            *p = *firstdig;
            *firstdig = temp;   /* swap *p and *firstdig */
            --p;
            ++firstdig;         /* advance to next two digits */
        } while (firstdig < p); /* repeat until halfway */
}

/* Actual functions just call conversion helper with neg flag set correctly,
   and return pointer to buffer. */

/*
 * _i64toaKAZE - signed 64-bit integer to text.
 * A leading '-' is produced only for negative values in radix 10; in every
 * other radix the bit pattern of val is printed as an unsigned number.
 * Returns buf.
 */
char * _i64toaKAZE (
        long long val,
        char *buf,
        int radix
        )
{
        int show_sign = 0;

        if ( radix == 10 && val < 0 )
            show_sign = 1;

        x64toaKAZE((unsigned long long)val, buf, radix, show_sign);
        return buf;
}

/*
 * _ui64toaKAZE - unsigned 64-bit integer to text in the given radix.
 * Returns buf.
 */
char * _ui64toaKAZE (
        unsigned long long val,
        char *buf,
        int radix
        )
{
        const int never_negative = 0;   /* unsigned values carry no sign */

        x64toaKAZE(val, buf, radix, never_negative);
        return buf;
}

/*
 * _ui64toaKAZEzerocomma - unsigned 64-bit integer to a fixed-width,
 * zero-padded text with a ',' after every third digit (counted from the
 * right).
 *
 * The result always occupies buf[0..25] with the NUL at buf[26], so buf must
 * provide at least 27 bytes; for radix 10 it reads "00,000,000,000,000,000,000"
 * with the digits of val right-aligned. Returns buf.
 *
 * The conversion is done in place: the plain digits are first written at the
 * front of buf and then moved, last digit first, to the back of the same
 * buffer.
 */
char * _ui64toaKAZEzerocomma (
        unsigned long long val,
        char *buf,
        int radix
        )
{
        int src;            /* index of the next unread digit, -1 when none is left */
        int dst;            /* index of the slot being filled                       */
        int placed = 0;     /* digits (real or padding) placed so far               */

        x64toaKAZE(val, buf, radix, 0);

        /* Locate the last digit; at least one digit is always present. */
        src = 0;
        while (buf[src + 1] != '\0')
            src++;

        buf[26] = 0;

        for (dst = 25; dst >= 0; dst--)
        {
            if (src >= 0)
            {
                buf[dst] = buf[src];
                src--;
            }
            else
            {
                buf[dst] = (char) ('0');
            }
            placed++;

            if (placed % 3 == 0)
            {
                /* a group of three is complete: put the separator in front of it */
                dst--;
                buf[dst] = (char) (',');
            }
        }

        return buf;
}

/*
 * _ui64toaKAZEcomma - unsigned 64-bit integer to text with a ',' between
 * groups of three digits (counted from the right), without padding.
 *
 * The text is built right-aligned so that it ends at buf[25], with the NUL
 * at buf[26]; buf must therefore provide at least 27 bytes. The returned
 * pointer addresses the first character of the text INSIDE buf - it is
 * generally not equal to buf.
 */
char * _ui64toaKAZEcomma (
        unsigned long long val,
        char *buf,
        int radix
        )
{
        int src;            /* index of the next unread digit, -1 when none is left */
        int dst = 25;       /* index of the next slot to fill                       */
        int placed = 0;     /* digits placed so far                                 */

        x64toaKAZE(val, buf, radix, 0);

        /* Locate the last digit; at least one digit is always present. */
        src = 0;
        while (buf[src + 1] != '\0')
            src++;

        buf[26] = 0;

        while (src >= 0)
        {
            buf[dst] = buf[src];
            src--;
            placed++;

            /* separator only when further digits follow on the left */
            if (placed % 3 == 0 && src >= 0)
            {
                dst--;
                buf[dst] = (char) (',');
            }
            dst--;
        }

        return buf + dst + 1;
}

int main( int argc, char *argv[])
  {
FILE *fp_in;
FILE *fp_in2;
FILE *fp_in3;
FILE *fp_inLINE;
FILE *fp_outLINE;

      time_t t1, t2, t3;
      char workK[1024*128];
      long workKoffset = -1;

unsigned long LongestLine;
unsigned long ShortestLine;
unsigned long DumpedLines=0;
unsigned long DumpedLinesOLD=0;

unsigned long long FilesLEN;
unsigned long k, k2, k3, LINE10len, LINE10len2, wrdlen;
unsigned long long NumberOfLines=0;
unsigned long size_in, size_in2;
int ChunkL, ChunkR, ChunkN;
char *ChunkName="FLC\0";

#if defined(_WIN32_ENVIRONMENT_)
      unsigned long long size_inLINESIXFOUR;
#else
      size_t size_inLINESIXFOUR;
#endif /* defined(_WIN32_ENVIRONMENT_)  */

char LINE10[257]; // 000..255, 256 = 0
char LINE10_2[257];
char wrd[MaxLineLength+1+1]; // crlf
char workbyte;

unsigned long long int i, j;
char llTOaDigits[27]; // 9,223,372,036,854,775,807: 1(sign or carry)+19(digits)+1('\0')+6(,)

char SkipNextSubStagesFlag=0;

/*
The best balance speed/size is to divide x-grams into fixed-length files (thus binary search allowed easily).
Incoming phrases are searched likewise i.e. grouping them by fixed-length criterion - thus some locality (drive cache friendly) is benefited.

For example file with 3-grams is to be divided into '0,000,000\t' + 9-41 which equals range (10+9)-(10+41) or (41+10)-(9+10)+1=33 files.

Next 3-grams
0,139,954       it_can_be
0,034,168       he_was_an
become into chunks
he_was_an       0,034,168
it_can_be       0,139,954
of course resorted by first field.

And CRLF are 2bytes more i.e. 'record' is (10+9+2)-(10+41+2).

_x-gram_  1 (10+1+2)-(10+31+2)		(10+31+2)-(10+1+2)+1=	31
_x-gram_  2 (10+5+2)-(10+41+2)		(10+41+2)-(10+5+2)+1=	37
_x-gram_  3 (10+9+2)-(10+41+2)		(10+41+2)-(10+9+2)+1=	33
_x-gram_  4 (10+13+2)-(10+51+2)		(10+51+2)-(10+13+2)+1=	39
_x-gram_  5 (10+17+2)-(10+61+2)		(10+61+2)-(10+17+2)+1=	45
_x-gram_  6 (10+21+2)-(10+71+2)		(10+71+2)-(10+21+2)+1=	51
_x-gram_  7 (10+25+2)-(10+81+2)		(10+81+2)-(10+25+2)+1=	57
_x-gram_  8 (10+29+2)-(10+91+2)		(10+91+2)-(10+29+2)+1=	63
_x-gram_  9 (10+33+2)-(10+101+2)	(10+101+2)-(10+33+2)+1=	69
_x-gram_ 10 (10+37+2)-(10+111+2)	(10+111+2)-(10+37+2)+1=	75
*/

printf("FLC_Fixed-Length-Chunker, revision 1, written by Kaze.\n");
if (argc != 3) {
printf("Purpose: It chunkizes a given x-gram file to fixed-length files.\n");
printf("Usage: FLC_Fixed-Length-Chunker.exe filename {/1|/2|/3|/4|/5|/6|/7|/8|/9}\n");
printf("Example: FLC_Fixed-Length-Chunker.exe 4andabove_Gamera.tar.3.sorted /3\n");
exit (2);
}

if ( argc == 3 && strcmp("/1\0",argv[2]) == 0 ) {ChunkL = (10+1+2); ChunkR = (10+31+2); ChunkN = 31;}
if ( argc == 3 && strcmp("/2\0",argv[2]) == 0 ) {ChunkL = (10+5+2); ChunkR = (10+41+2); ChunkN = 37;}
if ( argc == 3 && strcmp("/3\0",argv[2]) == 0 ) {ChunkL = (10+9+2); ChunkR = (10+41+2); ChunkN = 33;}
if ( argc == 3 && strcmp("/4\0",argv[2]) == 0 ) {ChunkL = (10+13+2); ChunkR = (10+51+2); ChunkN = 39;}
if ( argc == 3 && strcmp("/5\0",argv[2]) == 0 ) {ChunkL = (10+17+2); ChunkR = (10+61+2); ChunkN = 45;}
if ( argc == 3 && strcmp("/6\0",argv[2]) == 0 ) {ChunkL = (10+21+2); ChunkR = (10+71+2); ChunkN = 51;}
if ( argc == 3 && strcmp("/7\0",argv[2]) == 0 ) {ChunkL = (10+25+2); ChunkR = (10+81+2); ChunkN = 57;}
if ( argc == 3 && strcmp("/8\0",argv[2]) == 0 ) {ChunkL = (10+29+2); ChunkR = (10+91+2); ChunkN = 63;}
if ( argc == 3 && strcmp("/9\0",argv[2]) == 0 ) {ChunkL = (10+33+2); ChunkR = (10+101+2); ChunkN = 69;}

(void) time(&t1);

        for( k = ChunkL; k <= ChunkR; k++ )
	{

memset (&LINE10[0], 0, 257);
memcpy (&LINE10[0], ChunkName, 3);
memcpy (&LINE10[0]+3, _ui64toaKAZEzerocomma(k, llTOaDigits, 10) +(26-3), 3);
LINE10[3+3]='_';
memcpy (&LINE10[0]+3+3+1, argv[1], strlen(argv[1]));
printf( "Creating: %s\n", LINE10 ); 

if( ( fp_outLOG = fopen( LINE10, "wb" ) ) == NULL )
{ printf( "FLC_Fixed-Length-Chunker: Can't open %s file.\n", LINE10 ); return( 1 ); }

if( ( fp_inLINE = fopen( argv[1], "rb" ) ) == NULL )
{ printf( "FLC_Fixed-Length-Chunker: Can't open %s file.\n", argv[1] ); return( 1 ); }

#if defined(_WIN32_ENVIRONMENT_)
   // 64bit:
_lseeki64( fileno(fp_inLINE), 0L, SEEK_END );
size_inLINESIXFOUR = _telli64( fileno(fp_inLINE) );
_lseeki64( fileno(fp_inLINE), 0L, SEEK_SET );
#else
   // 64bit:
fseeko( fp_inLINE, 0L, SEEK_END );
size_inLINESIXFOUR = ftello( fp_inLINE );
fseeko( fp_inLINE, 0L, SEEK_SET );
#endif /* defined(_WIN32_ENVIRONMENT_)  */

        //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	NumberOfLines = 0;
	wrdlen = 0;
        for( i = 0; i < size_inLINESIXFOUR; i++ )
	{

                // ~~~~~~~~~~~~ Buffering fread, 10x faster [
                if (workKoffset == -1) {
                        if (i + 1024*128 < size_inLINESIXFOUR) {
                                fread( &workK[0], 1, 1024*128, fp_inLINE );
                                workKoffset = 0;
                                workbyte = workK[workKoffset];
                        } else 
                        fread( &workbyte, 1, 1, fp_inLINE );
                } else {
                        workKoffset++;
                        workbyte = workK[workKoffset];
                        if (workKoffset == 1024*128 - 1) workKoffset = -1;
                }
                // ~~~~~~~~~~~~ Buffering fread, 10x faster ]
               
                // ~~~~~~~~~~~~ UnBuffered fread, 10x slower [
                        //fread( &workbyte, 1, 1, fp_inLINE );
                // ~~~~~~~~~~~~ UnBuffered fread, 10x slower ]

                        if( wrdlen < MaxLineLength +1+1)
                        { wrd[ wrdlen ] = workbyte; }

                        if (workbyte == 10) {
	                        if ( wrdlen+1 == k ) {
					NumberOfLines++;
	        	                fwrite( wrd, 1, wrdlen+1, fp_outLOG );
				}
				wrdlen = 0;
                        }
                        else wrdlen++;

        } // i 'for'
        //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

printf( "Encountered lines: %s\n", _ui64toaKAZEcomma(NumberOfLines, llTOaDigits, 10) ); 
fclose( fp_inLINE );
fflush(fp_outLOG); 

	} // k 'for'

(void) time(&t3);
if (t3 <= t1) {t3 = t1; t3++;}

printf( "Done in %s seconds.\n", _ui64toaKAZEcomma(t3 - t1, llTOaDigits, 10) ); 
return(0);
  }
