/**********************************************************
kasiski.c  (Project 1)


Written by: XXXXXXXXXXXXXXXXXX Former 443 student
Course:     CS443
Instructor: Dr. A. Brooke Stephens

Written in: ANSI C.

To compile: gcc kasis.c util.c -o kasis -lm ..assuming name is kasis.c

Note: This program is completely automated and does not
      require user interaction.


Input: This program takes the file name of some cipher text
       and a filename to store program created data.

Output: This program will output to supplied output_filename
        all data pertaining to the Kasiski method for
        finding key word length on Vigenere cipher created text.

Command Line execution:
   $> kasiski <ciphertext_input_filename> <output_data_filename>


Description:
   This program retrieves from a file, ciphertext that was
created by the Vigenere method.  It applies the Kasiski method
of keyword length retrieval.  The program performs the following
procedure:
   
      1) Retrieve data from ciphertext file.
      2) Take each line of text and combine into
         one long character string, with all new line characters
         removed.
      3) Traverse the new character string for all occurrences
         of strings consisting of 3 or more characters.
      4) While calculating step 3, ensure that the algorithm
         does not register occurrences for strings that
         are equal to or a subset of any string seen before.
 
         Assumtion: A larger string is more important than
         any subset string for calculating the keyword length.

      5) With each occurrence of every string, register into
         an array all positions of the occurrences, relative
         to the beginning of the search string and the ciphertext
         first character.

      6) Build another array consisting of distance measurements
         of some particular repetition string from the position of
         its very first occurrence.  Do this for all occurrences.

      7) Find all greatest common divisors of all the measurements,
         and the greatest number of hits, will most likely represent
         the keyword lenght (or a multiple there of).

      8) Output Hypothesis.

***********************************************************/
#define FALSE 0
#define TRUE 1

#include <string.h>
#include <stdlib.h>
#include"kasiski.h"
#include"util.h"



main(int argc, char *argv[])
{
  FILE *istream,            /* cipher text stream */
       *ostream;            /* output kasiski test */
  char *lines[MAXSIZE];     /* store all lines of cipher text file. */
  char *cline;              /* Single line of ciphertext.
                               Kasiski is performed on this varaible. */
  int  nchars= 0;           /* number of characters in ciphertext.*/

  /* Indexes into ciphertext for processing. */
  int  i, j;
  int  k, l, m;
  int  count;

  int  len,                 /* Current length of substring
                               search for # of occurrences. */
       ngrams;              /* Maximum # of substring length searches.*/
  char str[MAXLINE];        /* current substring, used for occurrence
                               searching. */
  char found[MAXSIZE][MAXSIZE]; /* Array of substrings already
                                   registered with occurrences. */
  int  distances[MAXSIZE];      /* Array of all the calculated,
                                   individual distances of all
                                   occurrences. */
  int  positions[MAXSIZE];      /* Array of all current positions
                                   for some surrent substring. */
  int  freq;                    /* Counter for number of string groups 
                                   found for n number of occurrences. */
  int  d;                       /* Total number of distances. */
  int  largestd;                /* largest distance measurement found. */
  int  ndiv[MAXSIZE];           /* A simple array to store the number
                                   of individual total hits of
                                   all numbers less than the maximum
                                   (largestd) and divisible by some
                                   and all distances. */
  int  cipherLen;               /* Hypothesized cipher keyword length. */
  char exist;                   /* Boolean operand */




  /* Allocate memory for all lines of ciphertext and initialize to 0. */
  for(i= 0; i< MAXLINE; i++)
   {
     if((lines[i]= (char *) malloc(MAXSIZE))== NULL)
       {
         fprintf(stderr, "ERROR: Cannot allocate memory for \"lines\"...\n");
         exit(1);
       }
     for(j= 0; j< MAXSIZE; j++)
      lines[i][j]= 0;
   }


  /* Check to see if the user executed program with the right number
     of arguments.  If not, then output useage statement and exit. */
  if(argc < 3)
    {
     fprintf(stderr,"USAGE: %s <ciphertext_file> <output_file>\n\n", argv[0]);
     exit(1);
    }


  /* Open all files. */
  if((istream=fopen(argv[1],"r")) == NULL)
    {
      fprintf(stderr,"ERROR: Could not read in file: %s\n\n", argv[1]);
      exit(-1);
    }
  if((ostream=fopen(argv[2],"w")) == NULL)
    {
      fprintf(stderr, "ERROR: could not open file %s, for write...\n\n",
              argv[2]);
      exit(1);
    }



  fprintf(ostream, "Original lines of Cipher Text are:\n");
  fprintf(ostream, "-----------------------------------\n\n");

  /* Retrieve all lines of ciphertext from input file. */
  count= 0;
  while(fgets(lines[count], MAXSIZE, istream) != NULL)
   {
     fprintf(ostream,"%s", lines[count]);
     nchars+= strlen(lines[count]) - 1;
     count++;
   }
  fclose(istream);




  fprintf(ostream, "\nNumber of Character in Cipher Text: %d\n", nchars);
  fprintf(ostream, "\n\n");
  fflush(ostream);

  /* Allocate memory for single character string of ciphertext. */
  if((cline= (char *) malloc(MAXLINE*MAXSIZE)) == NULL)
   {
     fprintf(stderr, "ERROR: Could not allocate memory for \"cline\"...\n");
     exit(1);
   }


  /* Create a single line of cipher text */
  for(i= 0; i< (MAXLINE*MAXSIZE); i++)
   *(cline+i)= 0;

  for(i= 0, k= 0; i< count; i++)
   {
     for(j= 0; j< strlen(lines[i]); j++)
      {
       if(lines[i][j] != '\n')
        {
          *(cline+k)= up2LowerCase(lines[i][j]);
          k++;
        }
      }
   }
  




  /* Search for all repeting strings, 3 or greater, up to 8 */
  
  for(i= 0; i< MAXLINE; i++)
   for(j= 0; j< MAXLINE; j++)
     found[i][j]= 0;

  ngrams= 8;      /*(int)((float) k/ 2.0)*/

  l= 0;     /* index list of known occurrences. (Used with found) */
  d= 0;     /* index list of distance measures of occurrences. */

  for(len= ngrams; len> 2; len--)
    {
      fprintf(ostream, "\n\nSearching for numbers of occurrences in %d lettered strings:\n", len);
      fprintf(ostream, "NOTE: numbers contained in [], represent distances\n");
      fprintf(ostream, "      from its corresponding position, to the first.\n");
      fprintf(ostream, "-------------------------------------------------------------------\n");

      /* Find occurrences of n character strings */
      for(i= 0; i< (k-len); i++)
       {
         freq= 0;
         strncpy(str, (cline+i), len);
         str[len]= 0;
    

         /* Check if string has been seen before. */
         for(m= 0, exist= FALSE; ((m< l) && (!exist)); m++)
          {
           if(subsetP(str, len, found[m], strlen(found[m])))
             exist= TRUE;        /* Already looked at string. */
          }



         /* If string has not been seen before, then search
            cipher text for another set of occurrences. */
         if(!exist)
          {
            for(j= 0; j< (k-len); j++)
             {
               if((strncmp(str, (cline+j), len)== 0)
                  && !containSpecialCharactersP(str, len))
                 {
                   positions[freq]= j+1;  /* Record position of some
                                             occurrence of a string. */
                   freq++;                /* Increment occurrence of string. */
                 }
             }



            /* If there were more than 1 occurrence of a string,
               then register it. */
            if(freq> 1)
             {
               fprintf(ostream, "%s was found %d times. Positions: ", str, freq);
               fprintf(ostream, "%d", positions[0]);


               /* Print out all positions of a certain string occurrence. */
               for(m= 1; m< freq; m++)
                {
                  distances[d]= positions[m] - positions[0];
                  d++;
                  fprintf(ostream, ", %d [%d]", positions[m], 
                                                (positions[m] - positions[0]));
                }
               fprintf(ostream, "\n");

               strncpy(found[l], str, len);
               l++;
             }
          }
       }
    }
  fprintf(ostream, "\n");





  /* Output all calculated distances from ALL occurrences. */
  fprintf(ostream, "\n\nDistances:\n");
  fprintf(ostream, "-------------\n");
  largestd= 0;
  for(i= 0; i< d; i++)
   {
     fprintf(ostream, "%d\n", distances[i]);
     if(distances[i] > largestd)
       largestd= distances[i];
   }


  




  /* Go through all POSSIBLE divisors up to largestd and
     register for each divisor how many times it finds a number
     that it can divide into, from all the numbers in the array
     of distance measurements.  It is conceivable, that the divisor
     that divides the most distance will most likely represent the
     hypothesized keyword length.  This is what is calculated.*/

  /* Initialize ndiv to 0 */
  for(i= 0; i< largestd; i++)
    ndiv[i]= 0;

  for(i= 1; i< largestd; i++)
    for(j= 0; j< d; j++)
      if((distances[j] % i) == 0)
        ndiv[i]++;


  /* Output length distribution */
  fprintf(ostream, "\n\nLengths:\n");
  fprintf(ostream, "\n");

  fprintf(ostream, "Divisor (length)    # of hits\n");
  fprintf(ostream, "----------------    ----------\n");
  cipherLen= 2;
  for(i= 2; i< largestd; i++)
   {
     if(ndiv[i] != 0)
       fprintf(ostream, "   %5d ->          %5d\n", i, ndiv[i]);

     if(ndiv[i] > ndiv[cipherLen])
       cipherLen= i;
   }

  fprintf(ostream, "\n******************\n");
  fprintf(ostream, "Hypothesis:\n");
  fprintf(ostream, "Cipher Key length, based off the largest\n");
  fprintf(ostream, "occurrences of divisible numbers is most likely\n");
  fprintf(ostream, "some multiple of %d characters long.\n", cipherLen);
    




  /* Clean up. */
  fclose(ostream);

  free(cline);
  for(i= 0; i< count; i++)
   free(lines[i]);
  
  printf("Done...\n");

  exit(0);
} /* main */
/* End of Program */

