/*
 * compo02.cpp
 *
 * Hugi compo #2 entry
 *
 * Author:  AZ
 * Country: Belarus
 *
 * Version 1.3
 *
 * Tested with VC++ 4.0
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <ctype.h>

#undef DEBUG

typedef unsigned char byte;

/*
 * Output bit stream.
 *
 * data      - buffer receiving the packed bits, most significant bit first
 * data_ptr  - index of the byte currently being filled; after flush_bits()
 *             it is the number of complete bytes in data[]
 * data_bits - number of bits already stored in data[data_ptr] (0..7)
 */
byte data[2048];
int  data_ptr = 0;
int  data_bits = 0;

/*
 * Appends the n_bits low-order bits of `bits` to the output stream,
 * most significant of those bits first.
 *
 * bits   - value to write; must be < 2^n_bits
 * n_bits - number of bits to write; 0 writes nothing and must be smaller
 *          than the width of unsigned long
 *
 * Violated preconditions and running out of room in data[] trip an assert.
 */
void put_bits(unsigned long bits, int n_bits)
{
    assert(n_bits >= 0 && n_bits < (int)(sizeof(bits) * 8));
    assert(bits < (1UL << n_bits));

    while (n_bits > 0)
    {
        n_bits--;
        assert(data_ptr < (int)sizeof(data));

        /* a byte is cleared when its first bit arrives */
        if (data_bits == 0) data[data_ptr] = 0;

        /*
         * Pick the next bit by shifting the value right; this avoids the
         * int-typed "1 << (width-1)" mask, which is out of range for int.
         */
        data[data_ptr] = (byte)((data[data_ptr] << 1) | ((bits >> n_bits) & 1UL));

        data_bits++;
        if (data_bits == (int)(sizeof(byte) * 8))
        {
            data_bits = 0;
            data_ptr++;
        }
    }
}

/*
 * Completes a partially filled byte: the bits written so far are moved to
 * the top of the byte, the remaining low bits are left as zero and data_ptr
 * advances past it. Does nothing when the stream is already byte aligned.
 */
void flush_bits(void)
{
    if (data_bits != 0)
    {
        data[data_ptr] = (byte)(data[data_ptr] << ((int)(sizeof(byte) * 8) - data_bits));
        data_ptr++;
        data_bits = 0;
    }
}

void main(void)
{
	char text[1024];
	int textsize = -1;
	FILE *ftext = fopen("text.txt", "rb");
	assert(ftext != NULL);
	textsize = fread(text, 1, sizeof(text), ftext);
	assert(textsize < sizeof(text));
	fclose(ftext);

	int char_count[256];
	memset(char_count, 0, sizeof(char_count));
	int i;
	for (i = 0; i < textsize; i++)
	{
		char_count[text[i]]++;
	};

#ifdef DEBUG
	printf("Char frequences:\n");
	int diff_chars = 0;
	for (i = 0; i < 256; i++)
	{
		if (char_count[i] != 0)
		{
			if (i <= ' ') printf("0x%02x", i); else printf("%c", char(i));
			printf(" - %d\n", char_count[i]);
			diff_chars++;
		};
	};
	printf("Total different chars: %d\n", diff_chars);
#endif

	int c_single_chars = 0;
	int c_links = 0;
	int longest_match = 0;
	int farthest_link = 0;
    int c_short_links = 0;
    int c_long_links = 0;
    int c_spaces = 0;
	memset(char_count, 0, sizeof(char_count));
   	for (int src = 0; src < textsize;)
	{
		int best_match = 0;
		int best_pos = -1;

		for (int prev = src - 511; prev < src; prev++)
		{
			if (text[prev] != text[src]) continue;

			int match_size;
			for (match_size = 0; src + match_size < textsize; match_size++)
			{
				if (text[prev + match_size] != text[src + match_size]) break;
			};

			if (match_size >= best_match) // closer is better
			{
				best_match = match_size;
				best_pos = prev;
			};
		};

		if (best_match < 4)
		{
			char c = tolower(text[src]);
#ifdef DEBUG
			if (c != ' ')
            {
                if (c <= ' ') printf("0x%02x", int(c)); else printf("%c", char(c));
    			printf("\n");
            };
#endif

            char_count[c]++;
			src++;
			c_single_chars++;

            byte enc_char = c;
            byte put_char = 0;
            if (c < 'a' || c > 'z')
            {
                switch (c)
                {
                case 0x0d:

                    enc_char = 0x1a;
                    assert(text[src] == 0x0a);
                    src++;
                    break;

                case 0x20:

                    /*
                    if (src < textsize-1 && text[src] == 0x20)
                    {
#ifdef DEBUG
                        printf("0x2020\n");
#endif
                        enc_char = 'j'-'a';
                        src++;
                    }
                    else
                    {
#ifdef DEBUG
                        printf("0x20\n");
#endif
                        enc_char = 0x1b;
                    };
                    */
#ifdef DEBUG
                    printf("0x20\n");
#endif
                    put_bits(0, 2); // 00b - space
                    put_char = 1;
                    c_spaces++;
                    break;

                case ',':

                    enc_char = 0x1c;
                    break;

                case '-':

                    enc_char = 0x1d;
                    break;

                case '.':

                    enc_char = 0x1e;
                    break;

                case '?':

                    /*
                    assert(text[src+0] == 0x0d);
                    assert(text[src+1] == 0x0a);
                    assert(text[src+2] == 0x20);
                    assert(text[src+3] == 0x20);
                    assert(text[src+4] == 0x20);
                    src += 5;
                    enc_char = 0;//0x1f;
                    */
                    enc_char = 0x1f;
                    break;

                default:

                    assert(0);
                    break;
                };
            }
            else
            {
                enc_char -= 'a';
            };
            if (!put_char)
            {
                put_bits(1, 1);
                put_bits(enc_char, 5);
            };
		}
		else
		{
            int capitalise = 0;
            for (int i = src+best_match; i < textsize; i++)
            {
                if (text[i] == '.' || text[i] == '?') break;
                if (isalpha(text[i]))
                {
                    capitalise = isupper(text[i]) ? 1 : 0;
                    break;
                };
            };

            {
                //put_bits('x'-'a', 5); // 01b - link
                put_bits(1, 2); // link
                put_bits(src - best_pos, 9); // where
                if (capitalise) put_bits(0, 4);
                put_bits(best_match-2, 4); // length
            };

#ifdef DEBUG
            printf("match: pos=%d, len=%d [%s]\n", (best_pos-src), best_match, (capitalise ? "up" : "lo"));
#endif

			src += best_match;
			c_links++;
			if (best_match > longest_match) longest_match = best_match;
			if (src - best_pos > farthest_link) farthest_link = src - best_pos;

            if (src - best_pos >= 128)
            {
                c_long_links++;
            }
            else
            {
                c_short_links++;
            };
		};
	};
#ifdef DEBUG
	printf("Char frequences (after compression):\n");
	diff_chars = 0;
	for (i = 0; i < 256; i++)
	{
		if (char_count[i] != 0)
		{
			if (i <= ' ') printf("0x%02x", i); else printf("%c", char(i));
			printf(" - %d\n", char_count[i]);
			diff_chars++;
		};
	};

	printf("Total different chars: %d\n", diff_chars);
	printf("Single chars (except spaces): %d\n", c_single_chars-c_spaces);
	printf("Links: %d\n", c_links);
	printf("Long links: %d\n", c_long_links);
	printf("Short links: %d\n", c_short_links);
	printf("Longest match: %d\n", longest_match);
	printf("Farthest link: %d\n", farthest_link);
#endif

	printf("Compressed data size estimate: %d bytes\n", ((c_single_chars + 1 /* eof */) * 5 + c_links * 18 + 7) / 8);

    put_bits(1, 1); // end of stream
    put_bits('z'-'a', 5); // end of stream
    flush_bits();

	printf("Actual compressed data size: %d bytes\n", data_ptr);

    FILE *ftemplate = fopen("compo02.dec", "rb");
    assert(ftemplate != NULL);
    FILE *fcom = fopen("compo02.com", "wb");
    assert(fcom != NULL);
    byte code[1024];
    int code_size = fread(code, 1, sizeof(code), ftemplate);
    fwrite(code, 1, code_size, fcom);
    fwrite(data, 1, data_ptr, fcom);
    fclose(ftemplate);
    fclose(fcom);
};
