#include <GL/glew.h>
#include <GLFW/glfw3.h>
#include <arpa/inet.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#define SELFTEST
#ifdef SELFTEST
#include <openssl/md5.h>
#endif

#define MAX(a,b) (((a)>(b))?(a):(b))
#define MIN(a,b) (((a)<(b))?(a):(b))

// shader glsl source
extern unsigned char md5_glsl[];
extern unsigned int md5_glsl_len;

static inline uint64_t now() {
    static struct timeval tv;
    (void)gettimeofday(&tv, NULL);
    return (uint64_t)(tv.tv_sec*1000) + (uint64_t)(tv.tv_usec/1000);
}

static unsigned dlog(unsigned n, unsigned max) {
    uint64_t rv = n;
    while ((rv+1) * (uint64_t)n <= (uint64_t)max) {
        rv = (rv+1) * (uint64_t)n;
    }
    return (unsigned)rv;
}

static const unsigned max_shaders = 65536;
static const unsigned iterations = 1;

static const uint32_t chars[] = {
    0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, // a-z
    0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, // A-Z
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, // 0-9
    0x21, 0x23, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2d, 0x2e, 0x2f, 0x3f, 0x40, 0x5f // ! # $ % & ( ) - . / ? @ _
};
static const unsigned numchars = sizeof(chars)/sizeof(chars[0]);

static const unsigned maxcharuint = dlog(numchars, 0xffffu-max_shaders); // e.g. "_____"
static const unsigned mincharuint = dlog(numchars, maxcharuint-1) + 1;   // e.g. "aaaaa"

static void id2hilo(uint64_t i, uint32_t& h, uint32_t& l) {
    if (i <= maxcharuint) {
        l = i;
        h = 0;
    } else {
        l = mincharuint + ((i-maxcharuint-1) % (maxcharuint-mincharuint+1));
        h = 1 +           ((i-maxcharuint-1) / (maxcharuint-mincharuint+1));
    }
}
static uint64_t hilo2id(uint32_t h, uint32_t l) {
    if (!h) {
        return l;
    } else {
        return (((uint64_t)h-1) * (maxcharuint-mincharuint+1)) + ((uint64_t)l-mincharuint) + maxcharuint + 1;
    }
}
static const char* id2str(uint32_t h, uint32_t l) { // must match glsl's semantics
    static unsigned char sstr[2][12];
    static unsigned os = 0;
    os = (os+1) % 2;
    unsigned char* str = sstr[os];

    unsigned len = 0;
    while (l > 0u) {
        l--;
        uint32_t rem = l % numchars;
        l /= numchars;
        str[len] = (unsigned char)chars[rem];
        len++;
    }
    while (h > 0u) {
        h--;
        uint32_t rem = h % numchars;
        h /= numchars;
        str[len] = (unsigned char)chars[rem];
        len++;
    }

    str[len] = '\0';
    return (const char*)str;
}
static const char* id2str(uint64_t i) {
    uint32_t h, l;
    id2hilo(i, h, l);
    return id2str(h, l);
}

static inline GLuint atomic_get() {
    GLuint val = *(GLuint*)glMapBuffer(GL_ATOMIC_COUNTER_BUFFER, GL_READ_ONLY); // TODO: http://stackoverflow.com/questions/33993877/shader-storage-buffer-contents-of-varying-size-transfered-to-array-buffer
    glUnmapBuffer(GL_ATOMIC_COUNTER_BUFFER);
    return val;
}
static inline void atomic_set(GLuint val) {
    *(GLuint*)glMapBufferRange(GL_ATOMIC_COUNTER_BUFFER, 0, sizeof(GLuint), GL_MAP_WRITE_BIT|GL_MAP_INVALIDATE_BUFFER_BIT|GL_MAP_UNSYNCHRONIZED_BIT) = val;
    glUnmapBuffer(GL_ATOMIC_COUNTER_BUFFER);
}


int main(int argc, char** argv) {
    // parse input
    if (argc != 2) return 1;
    uint32_t hash[4];
    if (sscanf(argv[1], "%8x%8x%8x%8x", &hash[0], &hash[1], &hash[2], &hash[3]) != 4) return 1;
    printf("looking for %8x %8x %8x %8x...\n", htonl(hash[0]), htonl(hash[1]), htonl(hash[2]), htonl(hash[3]));

    // determine parallelism
    const size_t vertices_num = MIN(GL_MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, max_shaders); // we can fetch this # of uints for feedback
    printf("will use %zd vertices\n", vertices_num);

    // string generation debug output
    printf("will use %u chars, %u/%u %s/%s min/max hi/lo\n", numchars, mincharuint, maxcharuint, id2str(mincharuint), id2str(maxcharuint));

    // init glfw
    glfwInit();
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 2);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
    glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);
    GLFWwindow* win = glfwCreateWindow(10, 10, argv[0], NULL, NULL); // TODO: can get rid of window?
    if (!win) return 1;
    glfwMakeContextCurrent(win);

    // init glew
    glewExperimental = GL_TRUE;
    if (glewInit() != GLEW_OK) return 1;
    (void)glGetError();

    // set some gl settings
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    glEnable(GL_RASTERIZER_DISCARD); // TODO: find other optimizations

    // create main vertex shader
    GLuint shader = glCreateShader(GL_VERTEX_SHADER);
    const char* glsl[1] = { (const char*)md5_glsl };
    glShaderSource(shader, 1, glsl, (int*)&md5_glsl_len);
    glCompileShader(shader);
    GLint status;
    glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
    if (status != GL_TRUE) {
        static char buffer[512];
        glGetShaderInfoLog(shader, sizeof(buffer), NULL, buffer);
        printf("%s\n", buffer);
        glDeleteShader(shader);
        return 1;
    }

    // create program and register feedback for the output var
    GLuint program = glCreateProgram();
    glAttachShader(program, shader);
    const GLchar* feedbackVaryings[] = { "outValue" };
    glTransformFeedbackVaryings(program, 1, feedbackVaryings, GL_INTERLEAVED_ATTRIBS);
    glLinkProgram(program);
    glUseProgram(program);

    // create vertex array VAO XXX: needed?
    GLuint vao;
    glGenVertexArrays(1, &vao);
    glBindVertexArray(vao);

    // create VBO with [0,vertices_num[ as per-shader id/offset for input var
    GLuint vbo;
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    GLuint data[vertices_num];
    for (unsigned i=0; i<vertices_num; i++) data[i] = i*iterations;
    glBufferData(GL_ARRAY_BUFFER, sizeof(data), data, GL_STATIC_DRAW);
    GLint inputAttrib = glGetAttribLocation(program, "inValue");
    glEnableVertexAttribArray(inputAttrib);
    glVertexAttribIPointer(inputAttrib, 1, GL_UNSIGNED_INT, 0, 0);

    // create feedback buffer TBO
    GLuint feedback[vertices_num];
    GLuint tbo;
    glGenBuffers(1, &tbo);
    glBindBuffer(GL_ARRAY_BUFFER, tbo);
    glBufferData(GL_ARRAY_BUFFER, sizeof(feedback), NULL, GL_STATIC_READ);
    glBindBufferBase(GL_TRANSFORM_FEEDBACK_BUFFER, 0, tbo);

    // create buffer for the atomic counter signalling success
    GLuint abo;
    glGenBuffers(1, &abo);
    glBindBuffer(GL_ATOMIC_COUNTER_BUFFER, abo);
    glBufferData(GL_ATOMIC_COUNTER_BUFFER, sizeof(GLuint), NULL, GL_STATIC_DRAW); // XXX: GL_DYNAMIC_COPY, GL_DYNAMIC_DRAW?
    atomic_set(0);
    glBindBufferBase(GL_ATOMIC_COUNTER_BUFFER, 0, abo); // needed?

    // get/set uniform inputs
    glUniform1uiv(glGetUniformLocation(program, "chars"), numchars, chars);
    glUniform1ui(glGetUniformLocation(program, "numchars"), numchars);
    glUniform2ui(glGetUniformLocation(program, "limituint"), mincharuint, maxcharuint);
    glUniform1ui(glGetUniformLocation(program, "iterations"), iterations);
    glUniform4ui(glGetUniformLocation(program, "needle"), htonl(hash[0]), htonl(hash[1]), htonl(hash[2]), htonl(hash[3]));
    glUniform2ui(glGetUniformLocation(program, "prefix"), 0, 0); // XXX: Set salt and its length here. TODO: parse from cmdline
    GLint uniOffset = glGetUniformLocation(program, "offset");

    // do some test-runs at interesting corner cases
    #ifndef SELFTEST
        printf("skipping self-test\n");
    #else
        printf("performing self-test...\n");
        assert(iterations == 1); // TODO:
        for (unsigned i=maxcharuint-3; i<=maxcharuint+3; ++i) {
            const char* str = id2str(i); // prefix, too?;
            size_t len = strlen(str);

            uint32_t digest[4];
            MD5_CTX c;
            MD5_Init(&c);
            MD5_Update(&c, str, len);
            MD5_Final((unsigned char*)digest, &c);

            static const size_t shader_num = 10;
            static const size_t shader_no = vertices_num-(shader_num/2)+1;
            uint32_t h, l;
            id2hilo(i-shader_no, h, l);
            glUniform2ui(uniOffset, h, l);

            glUniform4ui(glGetUniformLocation(program, "needle"), digest[0], digest[1], digest[2], digest[3]);
            glDrawArrays(GL_POINTS, vertices_num-shader_num, shader_num);

            if (atomic_get() != 1) {
                printf("failed!\n");
                return 1;
            }
            atomic_set(0);

            glUniform4ui(glGetUniformLocation(program, "needle"), ~digest[0], digest[1], digest[2], digest[3]);
            glDrawArrays(GL_POINTS, vertices_num-shader_num, shader_num);

            if (atomic_get() != 0) {
                printf("failed!\n");
                return 1;
            }

            //printf("passed: '%s' (%u)\n", str, i);
        }
        // reset
        glUniform4ui(glGetUniformLocation(program, "needle"), htonl(hash[0]), htonl(hash[1]), htonl(hash[2]), htonl(hash[3]));
    #endif

    // main loop
    printf("starting...\n");
    const uint64_t start = now();
    unsigned last_check = 0;
    bool found_result = false;
    for (unsigned num=0; num<100000000; num++) {
        // set shader input offset
        uint32_t off_hi, off_lo;
        id2hilo((uint64_t)num * (uint64_t)vertices_num * (uint64_t)iterations, off_hi, off_lo);
        glUniform2ui(uniOffset, off_hi, off_lo);

        // draw or redraw without or with feedback capture, resp.
        if (found_result) glBeginTransformFeedback(GL_POINTS);
        glDrawArrays(GL_POINTS, 0, vertices_num);
        if (found_result) glEndTransformFeedback();
        glFlush();

        // check the success counter only from time to time
        if (!found_result) { // always check below if already found before
            if (last_check + 5000 > num) {
                continue; // checked shortly before
            } else {
                printf("checking '%s' (%lu,%u+%u)...\n", id2str(off_hi, off_lo), hilo2id(off_hi, off_lo), off_hi, off_lo);
            }
        }

        // check atomic counter for great success and go on if not. can do this while shaders run for the last run's output?
        bool great_success = atomic_get() > 0;
        if (!great_success) {
            last_check = num;
            continue;
        }

        // result for the first time? then retry everything since last checkpoint w/ feedback
        if (!found_result) {
            found_result = true;
            atomic_set(0);
            printf("FOUND: will re-draw %u rounds\n", num-last_check);
            num = last_check-1; // can underflow
            continue;
        } else {
            printf("FOUND: will check result\n");
        }

        // get capture and find the shader that has triggered success. TODO: can replace counter+feedback with some other, single buf?
        glGetBufferSubData(GL_TRANSFORM_FEEDBACK_BUFFER, 0, sizeof(feedback), feedback);
        for (unsigned i=0; i<vertices_num; i++) {
            if (feedback[i] != 0) {
                printf("FOUND: feedback %u, shader %u, round %u, offset %lu,%u+%u+%u\n", feedback[i], i, num, hilo2id(off_hi, off_lo), off_hi, off_lo, i);
                printf("FOUND: result '%s'\n", id2str(hilo2id(off_hi, off_lo) + (uint64_t)(feedback[i] & ~0x80000000u)));
                goto found;
            }
        }
        assert(false);
    }
    found:
    const uint64_t end = now();
    printf("took %lumsec\n", end-start);

    // some cleanup
    glDeleteProgram(program);
    glDeleteShader(shader);
    glDeleteBuffers(1, &tbo);
    glDeleteBuffers(1, &vbo);
    glDeleteBuffers(1, &abo);
    glDeleteVertexArrays(1, &vao);
    return 0;
}