/*---------------------------------------------------------------
 * File:	kgram.cc
 *
 * Author:	Liu Yang
 * Date:	Aug 2, 2009
 *
 * TO IMPLEMENT MULTI-BYTE MATCHING
 * --------------------------------------------------------------
 *  $Log $
 * */

#include <fstream>
#include <map>
#include <set>
#include <string.h>
#include <stdlib.h>
#include "kgram.h"

#define TRACE_ALPHA_RDT
#define TRACE_DIGRAM_RDT
//#define TRACE_ALPHA_LOADING
#define TRACE_RDT_TRANS_GEN

#define TRACE_FILL_RDT
#define TRACE_FILL_TRANS

using namespace std;
using std::ifstream;

void kgram_nfa::fill_trans_tab(char *f_nfa) {

  unsigned int num_accept_ids, num_accept_states, num_starts, num_transitions;
  unsigned int tmp_st, tmp_acc_id, tmps;
  //multimap<unsigned int, unsigned int> accept_ids;
  vector<unsigned int> accepting_states;
  pair< multimap<unsigned int, unsigned int>::iterator, multimap<unsigned int,unsigned int>::iterator > ret;
  multimap<unsigned int, unsigned int>::iterator itv;
  unsigned int cur_state, next_state, sym, n_acc;
  unsigned int num_filled = 0, trans_tab_sz = 0;
  unsigned int i, j;

  ifstream nfa_file;
  nfa_file.open(f_nfa);
  if (!nfa_file) {
    cerr << "File could not be openned" << endl;
    exit(1);
  }   

  /* read num of states and num of accept ids (signature ids) */
  nfa_file >> num_states >> num_accept_ids;
  for (i = 0; i < num_accept_ids; i++) {
    nfa_file >> tmp_st >> tmp_acc_id;
    accept_ids.insert(pair<int, int>(tmp_st, tmp_acc_id));
  }
  
  /* read the accepting states */
  nfa_file >> num_accept_states;
  for (i = 0; i < num_accept_states; i++) {
    nfa_file >> tmps;
    //accepting_states.push_back(tmps);
  }

  /* read the start states (may be multiple for NFA) */
  nfa_file >> num_starts;
  if (num_starts == 0) {
    cerr << "Invalid number of starts" << endl;
    exit(1);
  }
  for (i = 0; i < num_starts; i++) {
    nfa_file >> tmps;
    starts.insert(tmps);
  }

  /* read transition */
  nfa_file >> num_transitions;  

  trans_tab = new multimap<unsigned char, unsigned int> [num_states];

  for (i = 0; i < num_transitions; i++) {
    /* read one transition */
    nfa_file >> cur_state >> sym >> next_state;
    trans_tab[cur_state].insert(pair<unsigned char, unsigned int>(sym, next_state));
  }
  nfa_file.close();

  /* measure the size of transition table */
  for (i = 0; i < num_states; i++) {
    trans_tab_sz += sizeof(trans_tab[i]) + (sizeof(unsigned char) + sizeof(unsigned int)) * trans_tab[i].size();
  }

  cout << "Memory usage of trans tab " << trans_tab_sz << " bytes" << endl;

  /* fill out the accepting id's table */
  acc_ids = new unsigned int *[num_states];

  for (i = 0; i < num_states; i++) {
    j = 0;
    n_acc = 0;
    ret = accept_ids.equal_range(i);
    if (ret.second != ret.first) {
      /* get the number of acc ids before allocating space */	
      for (itv = ret.first; itv != ret.second; ++itv) {
	n_acc++;
      }
      acc_ids[i] = new unsigned int[n_acc + 1];
      for (itv = ret.first; itv != ret.second; ++itv) {
        acc_ids[i][j++] = itv->second;
      }
      acc_ids[i][j] = -1;
    } else {
      acc_ids[i] = 0;
    }
  }


}
/*
void kgram_nfa::construct_digram_trans() {
  unsigned int i, j;
  unsigned int s1, s2;
  set<unsigned int> tmp_next_st, tmp_sig_ids;
  unsigned char cur_digram[3];
  string cur_dg;

  cur_digram[2] = '\0';

  cout << "construct_digram_trans() num_states = " << num_states << endl; 
 
  for (i = 0; i < num_states; i++) {
    map<string, pair< set<unsigned int>, set<unsigned int> > > cur_trans;
    for (s1 = 0; s1 < ALPHABET_SIZE; s1++) {
      cur_digram[0] = (unsigned char)s1;
      for (s2 = 0; s2 < ALPHABET_SIZE; s2++) {
        cur_digram[1] = (unsigned char)s2;
        delta(i, cur_digram, 2, tmp_next_st, tmp_sig_ids);
	if (!tmp_next_st.empty() || !tmp_sig_ids.empty()) {
	  cur_dg.append(1, s1); 
	  cur_dg.append(1, s2); 
	  cur_trans.insert(pair<string, pair< set<unsigned int>, set<unsigned int> > >(cur_dg, pair< set<unsigned int>, set<unsigned int> >(tmp_next_st, tmp_sig_ids)));
	}
	tmp_next_st.clear();
	tmp_sig_ids.clear();
	cur_dg.clear();
      } 
    } 
    ktrans_tab.push_back(cur_trans);
  }
}
*/
void kgram_nfa::construct_digram_trans2() {
  unsigned int i, j;
  unsigned int s1, s2;
  set<unsigned int> tmp_next_st, tmp_sig_ids;
  set<unsigned int> *p_sids;
  unsigned char cur_digram[3];
  string cur_dg;
  size_t di_trans_tab_sz = 0;

  num_di_trans = 0;
  cur_digram[2] = '\0';

  cout << "construct_digram_trans() num_states = " << num_states << endl; 
 
  for (i = 0; i < num_states; i++) {
#ifdef TRACE_DIGRAM_RDT
  cout << "Processing state " << i << " of " << num_states << endl;
#endif
    map<string, pair< set<unsigned int>, set<unsigned int>* > > cur_trans;
    for (s1 = 0; s1 < ALPHABET_SIZE; s1++) {
      cur_digram[0] = (unsigned char)s1;
      for (s2 = 0; s2 < ALPHABET_SIZE; s2++) {
        cur_digram[1] = (unsigned char)s2;
        delta(i, cur_digram, 2, tmp_next_st, tmp_sig_ids);
	if (!tmp_next_st.empty() || !tmp_sig_ids.empty()) {
	  cur_dg.append(1, (unsigned char)s1); 
	  cur_dg.append(1, (unsigned char)s2);
 	  di_trans_tab_sz += sizeof(string) + 2;
	  di_trans_tab_sz += sizeof(tmp_next_st);
	  if (!tmp_next_st.empty()) {
	    di_trans_tab_sz += tmp_next_st.size() * sizeof(unsigned int);
	    num_di_trans += tmp_next_st.size();
	  }
	  di_trans_tab_sz += sizeof(set<unsigned int> *);
	  if (!tmp_sig_ids.empty()) {
	    p_sids = new set<unsigned int>;
	    p_sids->insert(tmp_sig_ids.begin(), tmp_sig_ids.end());
	    di_trans_tab_sz += sizeof(tmp_sig_ids) + tmp_sig_ids.size() * sizeof(unsigned int);
	  } else 
	    p_sids = NULL; 
	  cur_trans.insert(pair<string, pair< set<unsigned int>, set<unsigned int>* > >(cur_dg, pair< set<unsigned int>, set<unsigned int>* >(tmp_next_st, p_sids)));
	  //num_di_trans++;
	  di_trans_tab_sz += sizeof(pair<string, pair< set<unsigned int>, set<unsigned int>* > >);
	}
	tmp_next_st.clear();
	tmp_sig_ids.clear();
	cur_dg.clear();
      } 
    } 
    ktrans_tab.push_back(cur_trans);
  }
  cout << "Memory cost to keep the digram transition table is " << di_trans_tab_sz << " bytes!" << endl;
  cout << "Number of digrams transitions is: " << num_di_trans << endl;
}


/* Build the multi-byte transition table for grams of kgram_size bytes.
 *
 * Sizes 2 and 3 dispatch to the digram and trigram builders.  Any other
 * size only prints a diagnostic on cerr and builds nothing: sizes 0 and 1
 * are reported as too small (0 used to be reported as "larger than 3"),
 * sizes above 3 as unsupported. */
void kgram_nfa::construct_multi_byte_trans(unsigned char kgram_size) {
  switch(kgram_size) {
    case 0:
    case 1:
      cerr << "kgram_size should be at least 2!" << endl;
      break;
    case 2:
      kgram_nfa::construct_digram_trans2();
      break;
    case 3:
      kgram_nfa::construct_trigram_trans();
      break;
    default:
      cerr << "Gram size larger than 3 is not supported!" << endl;
      break;
  }
}

void kgram_nfa::construct_trigram_trans() {
  unsigned int i, j, k;
  unsigned int s1, s2, s3;
  set<unsigned int> tmp_next_st, tmp_sig_ids;
  set<unsigned int> *p_sids;
  unsigned char cur_digram[4];
  string cur_dg;

  cur_digram[3] = '\0';
  cout << "construct_trigram_trans() num_states = " << num_states << endl; 
 
  for (i = 0; i < num_states; i++) {
    map<string, pair< set<unsigned int>, set<unsigned int>* > > cur_trans;
    for (s1 = 0; s1 < ALPHABET_SIZE; s1++) {
      cur_digram[0] = (unsigned char)s1;
      for (s2 = 0; s2 < ALPHABET_SIZE; s2++) {
        cur_digram[1] = (unsigned char)s2;
	for (s3 = 0; s3 < ALPHABET_SIZE; s3++) {
          cur_digram[2] = (unsigned char)s3;
          delta(i, cur_digram, 3, tmp_next_st, tmp_sig_ids);
  	  if (!tmp_next_st.empty() || !tmp_sig_ids.empty()) {
	    cur_dg.append(1, s1); 
	    cur_dg.append(1, s2); 
	    cur_dg.append(1, s3);
	    if (!tmp_sig_ids.empty()) {
	      p_sids = new set<unsigned int>;
	      p_sids->insert(tmp_sig_ids.begin(), tmp_sig_ids.end());
	    } else
	      p_sids = NULL;
 	    cur_trans.insert(pair<string, pair< set<unsigned int>, set<unsigned int>* > >(cur_dg, pair< set<unsigned int>, set<unsigned int>* >(tmp_next_st, p_sids)));
	  }
	  tmp_next_st.clear();
	  tmp_sig_ids.clear();
	  cur_dg.clear();
	} 
      } 
    } 
    ktrans_tab.push_back(cur_trans);
  }  
}

void kgram_nfa::delta(unsigned int state, 
	       	      unsigned char sym, 
	       	      set<unsigned int>& s_next) {

  multimap<unsigned char, unsigned int>::iterator it;
  pair< multimap<unsigned char, unsigned int>::iterator, multimap<unsigned char, unsigned int>::iterator > ret;

  ret = trans_tab[state].equal_range(sym);
  for (it = ret.first; it != ret.second; ++it) {
#ifdef DEBUG
  cout << "delta() state " << state << " it->second = " << it->second << endl;
#endif
    s_next.insert(it->second);
  }
}

void kgram_nfa::delta(unsigned int cur_st, 
		      unsigned char *kgram, 
		      unsigned char len, 
	       	      set<unsigned int> &next_st,
		      set<unsigned int> &sig_ids) {

  unsigned int i, j; 
  set<unsigned int> frontiers, next_frontiers;
  set<unsigned int>::iterator it, it2, it_debug;
  unsigned char cur_sym;

  frontiers.insert(cur_st);

  for (i = 0; i < len; i++) {
    cur_sym = kgram[i];
    it = frontiers.begin();
    while (it != frontiers.end()) {
      delta(*it, cur_sym, next_frontiers);
      it++;
    }
    frontiers = next_frontiers;
    /* check for acceptance */
    for (it2 = frontiers.begin(); it2 != frontiers.end(); it2++) {
      if (acc_ids[*it2]) {
	j = 0;
        while (acc_ids[*it2][j] != -1) {
	  sig_ids.insert(acc_ids[*it2][j]);
	  j++;
	}
      }
    } 
    next_frontiers.clear();
  }
  
  next_st.insert(frontiers.begin(), frontiers.end());

}
/*
void kgram_nfa::simulate(const unsigned char *buf, unsigned int len, unsigned char kgram_size) {
  set<unsigned int> cur_st, next_st, sig_ids;
  set<unsigned int>::iterator it, it2;
  unsigned int i, j, k, r;
  map<string, pair< set<unsigned int>,  set<unsigned int> > >::iterator itm;
  multimap<unsigned char, unsigned int>::iterator itm2;
  pair< multimap<unsigned char, unsigned int>::iterator, multimap<unsigned char, unsigned int>::iterator > ret;

  r = (len - 1) % (unsigned int)kgram_size;

  cur_st = starts;
  
  for (i = 0; i < len - r - 1; i = i + kgram_size) {
    string cur_dg((const char*)(buf + i), (size_t)kgram_size);
    it = cur_st.begin();
    while (it != cur_st.end()) {
      itm = ktrans_tab[*it].find(cur_dg);
      if (itm != ktrans_tab[*it].end()) {
	next_st.insert(itm->second.first.begin(), itm->second.first.end());
	if (!itm->second.second.empty()) {
	  cout << "Matched at offset " << i << " sig id ";
	  for (it2 = itm->second.second.begin(); it2 != itm->second.second.end(); it2++)
	    cout << *it2 << " ";
	  cout << endl;
	}
      }
      it++;
    }

    cur_st = next_st;
    next_st.clear();
  }
  
  // process the remainder 
  for (k = i; k < len; k++) {
    it = cur_st.begin();
    while (it != cur_st.end()) {
      ret = trans_tab[*it].equal_range(buf[k]);
      for (itm2 = ret.first; itm2 != ret.second; ++itm2) {
        next_st.insert(itm2->second);
      }
      it++;
    }
    // check for acceptance/
    for (it = next_st.begin(); it != next_st.end(); it++) {
      if (this->acc_ids[*it]) {
        j = 0;
        printf("Matched at offset %u: ", k);
        while (this->acc_ids[*it][j] != -1) {
          printf("%d ", this->acc_ids[*it][j]);
          j++;
        }
        printf("\n"); 
      }
    }

      cur_st = next_st;
      next_st.clear();  
  }

}
*/
/*
void kgram_nfa::simulate(string buf, unsigned char kgram_size) {
  set<unsigned int> cur_st, next_st, sig_ids;
  set<unsigned int>::iterator it, it2;
  unsigned int i, j, k, r, len;
  map<string, pair< set<unsigned int>,  set<unsigned int> > >::iterator itm;
  multimap<unsigned char, unsigned int>::iterator itm2;
  pair< multimap<unsigned char, unsigned int>::iterator, multimap<unsigned char, unsigned int>::iterator > ret;

  len = buf.length();
  r = len % (unsigned int)kgram_size;

  cur_st = starts;
  
  for (i = 0; i < len - r; i = i + kgram_size) {
    it = cur_st.begin();
    while (it != cur_st.end()) {
      itm = ktrans_tab[*it].find(buf.substr(i, (size_t)kgram_size));
      if (itm != ktrans_tab[*it].end()) {
	next_st.insert(itm->second.first.begin(), itm->second.first.end());
	if (!itm->second.second.empty()) {
	  cout << "Matched at offset " << i << " sig id ";
	  for (it2 = itm->second.second.begin(); it2 != itm->second.second.end(); it2++)
	    cout << *it2 << " ";
	  cout << endl;
	}
      }
      it++;
    }

    cur_st = next_st;
    next_st.clear();
  }
  
  // process the remainder 
  for (k = i; k < len; k++) {
    it = cur_st.begin();
    while (it != cur_st.end()) {
      ret = trans_tab[*it].equal_range(buf.at(k));
      for (itm2 = ret.first; itm2 != ret.second; ++itm2) {
        next_st.insert(itm2->second);
      }
      it++;
    }
    // check for acceptance
    for (it = next_st.begin(); it != next_st.end(); it++) {
      if (this->acc_ids[*it]) {
        j = 0;
        printf("Matched at offset %u: ", k);
        while (this->acc_ids[*it][j] != -1) {
          printf("%d ", this->acc_ids[*it][j]);
          j++;
        }
        printf("\n"); 
      }
    }

      cur_st = next_st;
      next_st.clear();  
  }

} 
*/
/* Run the NFA over buf[0..len) using the k-gram table, printing matches.
 *
 * The input is consumed in grams of kgram_size bytes through ktrans_tab
 * for the first kgram_size * floor((len - 1) / kgram_size) bytes.  The
 * remaining bytes (always at least one) are consumed one at a time through
 * the one-byte table trans_tab, with acceptance read from acc_ids.
 *
 * Matches are written to stdout: gram matches report the offset at which
 * the gram starts, single-byte matches the offset of the byte.
 *
 * An empty buffer or a zero gram size is a no-op.  Previously len == 0
 * underflowed the unsigned loop bound and read past the buffer, and
 * kgram_size == 0 was a modulo by zero. */
void kgram_nfa::simulate2(const unsigned char *buf, unsigned int len, unsigned char kgram_size) {
  set<unsigned int> cur_st, next_st;
  set<unsigned int>::iterator it, it2;
  unsigned int i, j, k, r;
  map<string, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  multimap<unsigned char, unsigned int>::iterator itm2;
  pair< multimap<unsigned char, unsigned int>::iterator, multimap<unsigned char, unsigned int>::iterator > ret;

  if (buf == NULL || len == 0 || kgram_size == 0)
    return;

  /* r + 1 trailing bytes are left for the one-byte pass */
  r = (len - 1) % (unsigned int)kgram_size;

  cur_st = starts;

  for (i = 0; i < len - r - 1; i = i + kgram_size) {
    string cur_dg((const char*)(buf + i), (size_t)kgram_size);
    for (it = cur_st.begin(); it != cur_st.end(); ++it) {
      itm = ktrans_tab[*it].find(cur_dg);
      if (itm == ktrans_tab[*it].end())
        continue;
      next_st.insert(itm->second.first.begin(), itm->second.first.end());
      /* a NULL pointer means the gram reports no signature id */
      if (itm->second.second != NULL) {
        cout << "Matched at offset " << i << " sig id ";
        for (it2 = (itm->second.second)->begin(); it2 != (itm->second.second)->end(); it2++)
          cout << *it2 << " ";
        cout << endl;
      }
    }

    cur_st = next_st;
    next_st.clear();
  }

  /* process the remainder byte by byte */
  for (k = i; k < len; k++) {
    for (it = cur_st.begin(); it != cur_st.end(); ++it) {
      ret = trans_tab[*it].equal_range(buf[k]);
      for (itm2 = ret.first; itm2 != ret.second; ++itm2) {
        next_st.insert(itm2->second);
      }
    }
    /* check for acceptance */
    for (it = next_st.begin(); it != next_st.end(); it++) {
      if (this->acc_ids[*it]) {
        j = 0;
        printf("Matched at offset %u: ", k);
        while (this->acc_ids[*it][j] != (unsigned int)-1) {
          printf("%d ", this->acc_ids[*it][j]);
          j++;
        }
        printf("\n");
      }
    }

    cur_st = next_st;
    next_st.clear();
  }
}


void kgram_nfa::output_kgram(char *f_kgram) {
  ofstream out_file;
  set<unsigned int>::iterator it;
  map<string, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  unsigned int i;

  out_file.open(f_kgram);
  out_file << num_states << endl;
  out_file << starts.size() << " ";
  for (it = starts.begin(); it != starts.end(); it++) {
    out_file << *it << " ";
  }
  out_file << endl;

  for (i = 0; i < num_states; i++) {
    itm = ktrans_tab[i].begin();
    while (itm != ktrans_tab[i].end()) {
      out_file << itm->first << " ";
      it = itm->second.first.begin();
      while (it != itm->second.first.end()) {
	out_file << *it << " ";
	it++;
      }
      //if (!itm->second.second.empty()) {
      if (itm->second.second) {
	out_file << " sig_id ";
        it = itm->second.second->begin();
        while (it != itm->second.second->end()) {
	  out_file << *it << " ";
	  it++;
        }
      }
      out_file << endl;
      itm++;
    }
  }
  out_file.close();

}


void kgram_nfa::init_alphabet_map(unsigned char k) {
  unsigned int s1, s2;
  string cur_dg;

  for (s1 = 0; s1 < ALPHABET_SIZE; s1++) {
    for (s2 = 0; s2 < ALPHABET_SIZE; s2++) {
      cur_dg.append(1, (unsigned char)s1);
      cur_dg.append(1, (unsigned char)s2);
      class_c.insert(pair<string, unsigned int>(cur_dg, 0));
      digrams.push_back(cur_dg);
      cur_dg.clear();
    }
  }
}

void kgram_nfa::alphabet_reduction() {
  int max_class = 0;
  unsigned int s;
  set< pair<set<unsigned int>,  set<unsigned int> > > target_sets;
  set<unsigned int> empty_set;
  map< string, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  set< pair<set<unsigned int>,  set<unsigned int> > >::iterator it;
  map<string, unsigned int>::iterator itm2;
  int i, j;
  map<string, bool> kgram_covered;
  map<string, bool> class_covered;
  map<string, unsigned int> remap;
  
  /* initialize the alphabet mapping: all kgrams map to 0 */
  init_alphabet_map(2);

#ifdef TRACE_ALPHA_RDT
  cout << "ktrans_tab.size() " << ktrans_tab.size() << endl;
  cout << "finished initializing alphabet mapping" << endl;
#endif

  /* construct the target sets */
  for (i = 0; i < ktrans_tab.size(); i++) {
    for (itm = ktrans_tab[i].begin(); itm != ktrans_tab[i].end(); itm++) {
      if (itm->second.second)
        target_sets.insert(pair<set<unsigned int>,  set<unsigned int> >(itm->second.first, *(itm->second.second)));
      else 
        target_sets.insert(pair<set<unsigned int>,  set<unsigned int> >(itm->second.first, empty_set));
    }
  }
#ifdef TRACE_ALPHA_RDT
  cout << "finished constructing target sets" << endl;
  cout << "number of target sets " << target_sets.size() << endl;
#endif

  /* NOTE: num_states = ktrans_tab.size() */
  for (s = 0; s < num_states - 1; s++) {
#ifdef TRACE_ALPHA_RDT
  cout << "processing state " << s << " ..." << endl;
#endif
    for (it = target_sets.begin(); it != target_sets.end(); it++) {
      /* set kgram_covered and class_covered to false */
      kgram_covered.clear();
      init_kgram_covered(kgram_covered);
      class_covered.clear();
      init_class_covered(class_covered);
      /* map all kgrams to 0 */
      remap.clear();
      init_remap(remap);
      int on_zero = 0;
      //for (itm2 = class_c.begin(); itm2 != class_c.end(); itm2++) {
      for (i = 0; i < digrams.size(); i++) {
        //if ((unsigned char)c != 0) {
	  itm = ktrans_tab[s].find(digrams[i]);
	  if (itm->second.first == it->first) {
	    if (!itm->second.second) {
	      if (it->second.empty()) {
 	        kgram_covered[digrams[i]] = true;
                if (class_c[digrams[i]] == 0) {
	          if (on_zero == 0) {
	            on_zero = ++max_class;
	          }
	          class_c[digrams[i]] = on_zero;
	        } else
	          class_covered[digrams[class_c[digrams[i]]]] = true;		
	      }
	    } else if (*(itm->second.second) == it->second) {
 	        kgram_covered[digrams[i]] = true;
                if (class_c[digrams[i]] == 0) {
	          if (on_zero == 0) {
	            on_zero = ++max_class;
	          }
	          class_c[digrams[i]] = on_zero;
	        } else
	          class_covered[digrams[class_c[digrams[i]]]] = true;
 	    }
	  } 
	//}
	
      }
      for (i = 0; i < digrams.size(); i++) {
      //for (itm2 = class_c.begin(); itm2 != class_c.end(); itm2++) {
	if (!kgram_covered[digrams[i]] && class_covered[digrams[class_c[digrams[i]]]]) {
	  if (remap[digrams[class_c[digrams[i]]]] == 0) {
	    remap[digrams[class_c[digrams[i]]]] = ++max_class;
	  }
	  class_c[digrams[i]] = remap[digrams[class_c[digrams[i]]]];
	}
      }
    }
  }
#ifdef TRACE_ALPHA_RDT
  cout << "finished reduction" << endl;
  cout << " max_class = " << max_class << endl;
#endif
}

/* An improved alphabet reduction: this implementation leverages the locality
 * of transitions in NFA */
void kgram_nfa::alphabet_reduction_im() {
  int max_class = 0;
  unsigned int s;
  set< pair<set<unsigned int>,  set<unsigned int> > > *target_sets;
  set<unsigned int> empty_set;
  map< string, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  vector<string> *digram_set;
  set< pair<set<unsigned int>,  set<unsigned int> > >::iterator it;
  map<string, unsigned int>::iterator itm2;
  int i, j;
  map<string, bool> kgram_covered;
  map<string, bool> class_covered;
  map<string, unsigned int> remap;
  size_t ar_mem_cost = 0; 
 
  /* initialize the alphabet mapping: all kgrams map to 0 */
  init_alphabet_map(2);
  ar_mem_cost += sizeof(map<string, unsigned int>);
  ar_mem_cost += (sizeof(string) + 2 + sizeof(unsigned int)) * ALPHABET_SIZE * ALPHABET_SIZE;

#ifdef TRACE_ALPHA_RDT
  cout << "ktrans_tab.size() " << ktrans_tab.size() << endl;
  cout << "finished initializing alphabet mapping" << endl;
#endif

  target_sets = new set< pair<set<unsigned int>,  set<unsigned int> > > [num_states - 1];
  digram_set = new vector<string> [num_states - 1];

  ar_mem_cost += sizeof(set< pair<set<unsigned int>,  set<unsigned int> > >) * (num_states - 1);
  ar_mem_cost += sizeof(vector<string>) * (num_states - 1);

  /* construct the target sets */
  for (i = 0; i < ktrans_tab.size(); i++) {
    for (itm = ktrans_tab[i].begin(); itm != ktrans_tab[i].end(); itm++) {
      digram_set[i].push_back(itm->first);
      ar_mem_cost += sizeof(string) + 2;
      if (itm->second.second) {
        target_sets[i].insert(pair<set<unsigned int>,  set<unsigned int> >(itm->second.first, *(itm->second.second)));
 	ar_mem_cost += sizeof(pair<set<unsigned int>,  set<unsigned int> >) + sizeof(set<unsigned int>) + itm->second.first.size() * sizeof(unsigned int) + sizeof(set<unsigned int>) + itm->second.second->size() * sizeof(unsigned int);
      }
      else {
        target_sets[i].insert(pair<set<unsigned int>,  set<unsigned int> >(itm->second.first, empty_set));
  	ar_mem_cost += sizeof(pair<set<unsigned int>,  set<unsigned int> >) + sizeof(set<unsigned int>) + itm->second.first.size() * sizeof(unsigned int) + sizeof(set<unsigned int>);
      }
    }
  }
#ifdef TRACE_ALPHA_RDT
  cout << "finished constructing target sets" << endl;
#endif

  /* memory cost for init_kgram_covered(), init_class_covered(), and init_remap() */
  ar_mem_cost += sizeof(map<string, bool>) + (sizeof(string) + 2 + sizeof(bool)) * ALPHABET_SIZE * ALPHABET_SIZE;
  ar_mem_cost += sizeof(map<string, bool>) + (sizeof(string) + 2 + sizeof(bool)) * ALPHABET_SIZE * ALPHABET_SIZE;
  ar_mem_cost += sizeof(map<string, unsigned int>) + (sizeof(string) + 2 + sizeof(unsigned int)) * ALPHABET_SIZE * ALPHABET_SIZE;
  /* NOTE: num_states = ktrans_tab.size() */
  for (s = 0; s < num_states - 1; s++) {
#ifdef TRACE_ALPHA_RDT
  cout << "processing state " << s << " ..." << endl;
#endif
    for (it = target_sets[s].begin(); it != target_sets[s].end(); it++) {
      /* set kgram_covered and class_covered to false */
      kgram_covered.clear();
      init_kgram_covered(kgram_covered);
      class_covered.clear();
      init_class_covered(class_covered);
      /* map all kgrams to 0 */
      remap.clear();
      init_remap(remap);
      int on_zero = 0;

      for (i = 0; i < digram_set[s].size(); i++) {
	  itm = ktrans_tab[s].find(digram_set[s][i]);
	  if (itm->second.first == it->first) {
	    if (!itm->second.second) {
	      if (it->second.empty()) {
 	        kgram_covered[digram_set[s][i]] = true;
                if (class_c[digram_set[s][i]] == 0) {
	          if (on_zero == 0) {
	            on_zero = ++max_class;
	          }
	          class_c[digram_set[s][i]] = on_zero;
	        } else
	          class_covered[digrams[class_c[digram_set[s][i]]]] = true;		
	      }
	    } else if (*(itm->second.second) == it->second) {
 	        kgram_covered[digram_set[s][i]] = true;
                if (class_c[digram_set[s][i]] == 0) {
	          if (on_zero == 0) {
	            on_zero = ++max_class;
	          }
	          class_c[digram_set[s][i]] = on_zero;
	        } else
	          class_covered[digrams[class_c[digram_set[s][i]]]] = true;
 	    }
	  } 
      }
      for (i = 0; i < digrams.size(); i++) {
	if (!kgram_covered[digrams[i]] && class_covered[digrams[class_c[digrams[i]]]]) {
	  if (remap[digrams[class_c[digrams[i]]]] == 0) {
	    remap[digrams[class_c[digrams[i]]]] = ++max_class;
	  }
	  class_c[digrams[i]] = remap[digrams[class_c[digrams[i]]]];
	}
      }
    }
  }
  cout << "Memory cost for alphabet_reduction_im() is: " << ar_mem_cost << " bytes." << endl;
#ifdef TRACE_ALPHA_RDT
  cout << "finished reduction" << endl;
  cout << " max_class = " << max_class << endl;
#endif
}



void kgram_nfa::rdt_trans_tab_gen() {
  unsigned int i;
  map<string, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  size_t trans_gen_mem_cost = 0;

  rdt_ktrans_tab = new map<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > > [num_states];
  trans_gen_mem_cost += sizeof(map<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >) * num_states;

  num_rdt_trans = 0;
#ifdef TRACE_RDT_TRANS_GEN
  cout << "rdt_trans_tab_gen(): num_states = " << num_states << endl;
#endif
  for (i = 0; i < num_states; i++) {
#ifdef TRACE_RDT_TRANS_GEN
  cout << "rdt_trans_tab_gen(): i = " << i << endl;
#endif
    for (itm = ktrans_tab[i].begin(); itm != ktrans_tab[i].end(); itm++) {
      if (rdt_ktrans_tab[i].find(class_c[itm->first]) == rdt_ktrans_tab[i].end()) {
	rdt_ktrans_tab[i].insert(pair<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >(class_c[itm->first], itm->second));
	trans_gen_mem_cost += sizeof(pair<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >) + sizeof(unsigned int) + sizeof(pair< set<unsigned int>,  set<unsigned int>* >) + sizeof(set<unsigned int>) + itm->second.first.size() * sizeof(unsigned int) + sizeof(set<unsigned int>*); 
	//num_rdt_trans += itm->second.first.size();	 
	num_rdt_trans++;	/* this is not real num of transitions, but for output */	 
      }
    }
  }
  cout << "After alphbet reduction, number of transitions: " << num_rdt_trans << endl;
  cout << "Memory cost for rdt_trans_tab_gen() is: " << trans_gen_mem_cost << " bytes." << endl;
}

void kgram_nfa::dump_rdt_transitions(char *f) {
  ofstream out_file;
  set<unsigned int>::iterator it;
  map<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  unsigned int i;
  map<string, unsigned int>::iterator itm2;
  multimap<unsigned int, unsigned int>::iterator itm3;

  out_file.open(f);
  out_file << num_states << endl;

  /* output digram mapping table */
  out_file << class_c.size() << endl;	/* size of the alphabet */
  for (itm2 = class_c.begin(); itm2 != class_c.end(); itm2++) {
    //out_file << itm2->first << " " << itm2->second << endl;
    out_file << (int)(unsigned char)(itm2->first[0]) << " " << (unsigned int)(unsigned char)(itm2->first[1]) << " " << itm2->second << endl;
  }

  /* output the accept ids */
  out_file << accept_ids.size() << endl;
  for (itm3 = accept_ids.begin(); itm3 != accept_ids.end(); itm3++) {
    out_file << itm3->first << " " << itm3->second << endl;
  }

  out_file << starts.size() << " ";
  for (it = starts.begin(); it != starts.end(); it++) {
    out_file << *it << " ";
  }
  out_file << endl;

  out_file << num_rdt_trans << endl;	/* number of reduced transitions */
  for (i = 0; i < num_states; i++) {
    itm = rdt_ktrans_tab[i].begin();
    while (itm != rdt_ktrans_tab[i].end()) {
      /* output cur state number and digram */ 
      out_file << i << " " << itm->first << " ";
      out_file << itm->second.first.size() << " "; /* number of next set of states */
      it = itm->second.first.begin();
      while (it != itm->second.first.end()) {
	out_file << *it << " ";
	it++;
      }
      if (itm->second.second) {
        out_file << itm->second.second->size() << " ";	/* number of sig ids */
	out_file << " sig_id ";
        it = itm->second.second->begin();
        while (it != itm->second.second->end()) {
	  out_file << *it << " ";
	  it++;
        }
      } else 
        out_file << 0;	/* no sig id */
	
      out_file << endl;
      itm++;
    }
  }
  out_file.close();
}



/* Run the reduced automaton over buf[0..len), printing matches on stdout.
 *
 * The input is consumed in consecutive grams of kgram_size bytes (the
 * class member).  Each gram is mapped to its class through class_c and
 * looked up in rdt_ktrans_tab for every active state.  Trailing bytes that
 * do not fill a whole gram are ignored; the former one-byte remainder pass
 * was only used to verify the simulation and is no longer needed.
 *
 * The loop only reads a gram that lies entirely inside the buffer.  The
 * former bound `i < len - 1` gives the same iterations for digrams, but it
 * underflowed for len == 0 and read past the end for longer grams.  A zero
 * kgram_size, which would never advance, is rejected. */
void kgram_nfa::rdt_simulate(const unsigned char *buf, unsigned int len) {
  set<unsigned int> cur_st, next_st;
  set<unsigned int>::iterator it, it2;
  unsigned int i;
  map<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  const unsigned int gram_len = (unsigned int)kgram_size;

  if (buf == NULL || gram_len == 0 || len < gram_len)
    return;

  cur_st = starts;

  /* i <= len - gram_len guarantees buf[i .. i + gram_len) is in range */
  for (i = 0; i <= len - gram_len; i = i + gram_len) {
    string cur_dg((const char*)(buf + i), (size_t)gram_len);
    for (it = cur_st.begin(); it != cur_st.end(); ++it) {
      itm = rdt_ktrans_tab[*it].find(class_c[cur_dg]);
      if (itm == rdt_ktrans_tab[*it].end())
        continue;
      next_st.insert(itm->second.first.begin(), itm->second.first.end());
      /* a NULL pointer means the transition reports no signature id */
      if (itm->second.second != NULL) {
        cout << "Matched at offset " << i << " sig id ";
        for (it2 = (itm->second.second)->begin(); it2 != (itm->second.second)->end(); it2++)
          cout << *it2 << " ";
        cout << endl;
      }
    }

    cur_st = next_st;
    next_st.clear();
  }
}

/* this function uses the mapped alphabet */
/*
void kgram_nfa::rdt_simulate2(unsigned int *buf, unsigned int len) {
  set<unsigned int> cur_st, next_st, sig_ids;
  set<unsigned int>::iterator it, it2;
  unsigned int i;
  map<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  multimap<unsigned char, unsigned int>::iterator itm2;
  pair< multimap<unsigned char, unsigned int>::iterator, multimap<unsigned char, unsigned int>::iterator > ret;


  cur_st = starts;
  
  for (i = 0; i < len; i++) {
    it = cur_st.begin();
    while (it != cur_st.end()) {
      itm = rdt_ktrans_tab[*it].find(buf[i]);
      if (itm != rdt_ktrans_tab[*it].end()) {
	next_st.insert(itm->second.first.begin(), itm->second.first.end());
	if (itm->second.second != NULL) {
	  cout << "Matched at offset " << i << " sig id ";
	  for (it2 = (itm->second.second)->begin(); it2 != (itm->second.second)->end(); it2++)
	    cout << *it2 << " ";
	  cout << endl;
	}
      }
      it++;
    }

    cur_st = next_st;
    next_st.clear();
  }

} */

/* use pointer swap to improve efficiency */
void kgram_nfa::rdt_simulate2(unsigned int *buf, unsigned int len) {
  set<unsigned int> cur_st, next_st, sig_ids;
  set<unsigned int> *p_cur_st, *p_next_st, *p_tmp;
  set<unsigned int>::iterator it, it2;
  unsigned int i;
  map<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >::iterator itm;
  multimap<unsigned char, unsigned int>::iterator itm2;
  pair< multimap<unsigned char, unsigned int>::iterator, multimap<unsigned char, unsigned int>::iterator > ret;

  cur_st = starts;
  p_cur_st = &cur_st;
  p_next_st = &next_st;
 
  for (i = 0; i < len; i++) {
    it = p_cur_st->begin();
    while (it != p_cur_st->end()) {
      itm = rdt_ktrans_tab[*it].find(buf[i]);
      if (itm != rdt_ktrans_tab[*it].end()) {
	p_next_st->insert(itm->second.first.begin(), itm->second.first.end());
	if (itm->second.second != NULL) {
	  cout << "Matched at offset " << i << " sig id ";
	  for (it2 = (itm->second.second)->begin(); it2 != (itm->second.second)->end(); it2++)
	    cout << *it2 << " ";
	  cout << endl;
	}
      }
      it++;
    }

    p_tmp = p_cur_st;
    p_cur_st = p_next_st;
    p_next_st = p_tmp;
    p_next_st->clear();
  }

}


void kgram_nfa::fill_rdt_trans_tab(char *f) {

  unsigned int alphabet_size, num_starts, tmp_st;
  int i, j, num_sigs;
  string cur_dg, tmp_str;
  unsigned int dg0, dg1;
  unsigned int cur_mapped_dg, cur_st, cur_new_dg, num_next_st, tmp_next_st, tmp_num_sig_id, tmp_sig_id;
  unsigned int total_bytes = 0;
  set<unsigned int> tmp_next_st_set;
  set<unsigned int> *p_sig_id;
  ifstream nfa_file;

  nfa_file.open(f);
  if (!nfa_file) {
    cerr << "File could not be openned" << endl;
    exit(1);
  }

  nfa_file >> num_states;

  /* read the mapping table */
  nfa_file >> alphabet_size;
  for (i = 0; i < alphabet_size; i++) {
#ifdef TRACE_FILL_RDT
  cout << "processing mapping " << i << endl;
#endif
    nfa_file >> dg0 >> dg1 >> cur_mapped_dg;
    cur_dg.append(1, (unsigned char)dg0);
    cur_dg.append(1, (unsigned char)dg1);
    total_bytes += 2 * sizeof(unsigned char) + sizeof(string);
    total_bytes += sizeof(unsigned int);
    class_c.insert(pair<string, unsigned int>(cur_dg, cur_mapped_dg));
    total_bytes += sizeof(pair<string, unsigned int>);
    cur_dg.clear();
  }
  total_bytes += sizeof(map<string, unsigned int>);

  /* skip the signature ids (needed by Rezwana's program) */
  nfa_file >> num_sigs;
  for (i = 0; i < num_sigs; i++)
    nfa_file >> tmp_st >> tmp_sig_id;

  /* read the starting states */
  nfa_file >> num_starts;
  for (i = 0; i < num_starts; i++) {
    nfa_file >> tmp_st;
    starts.insert(tmp_st);
  }
  total_bytes += sizeof(set<unsigned int>) + starts.size() * sizeof(unsigned int);

  rdt_ktrans_tab = new map<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > > [num_states];
  total_bytes += num_states * sizeof(map<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >);

  /* read transitions */
  nfa_file >> num_rdt_trans;
  total_bytes += sizeof(unsigned int); 
#ifdef TRACE_FILL_RDT
  cout << "number of transitions = " << num_rdt_trans << endl;
#endif
  for (i = 0; i < num_rdt_trans; i++) {
#ifdef TRACE_FILL_RDT
  if (i % 1000 == 0)
    cout << "processing transition " << i << endl;
#endif
    nfa_file >> cur_st >> cur_new_dg >> num_next_st;
    tmp_next_st_set.clear();
    total_bytes += sizeof(unsigned int); 
    total_bytes += sizeof(set<unsigned int>); 
  
    /* read next set of states if any */
    for (j = 0; j < num_next_st; j++) {
      nfa_file >> tmp_next_st;
      tmp_next_st_set.insert(tmp_next_st);
      total_bytes += sizeof(unsigned int); 
    }
    /* read sig id if any */
    nfa_file >> tmp_num_sig_id;
    total_bytes += sizeof(set<unsigned int>*); 

    if (tmp_num_sig_id > 0) {
      p_sig_id = new set<unsigned int>;
      total_bytes += sizeof(set<unsigned int>); 
    }
    else 
      p_sig_id = 0;
    for (j = 0; j < tmp_num_sig_id; j++) {
      nfa_file >> tmp_str >> tmp_sig_id;
      p_sig_id->insert(tmp_sig_id);
      total_bytes += sizeof(unsigned int); 
    }
    rdt_ktrans_tab[cur_st].insert(pair<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >(cur_new_dg, pair<set<unsigned int>,  set<unsigned int>* >(tmp_next_st_set, p_sig_id)));
    total_bytes += sizeof(pair<unsigned int, pair< set<unsigned int>,  set<unsigned int>* > >);
  }
  cout << "total memory cost for reducted transition table = " << total_bytes << " bytes" << endl; 
#ifdef TRACE_FILL_RDT
  cout << "fill_rdt_trans_tab completed" << endl;
#endif

}

/* ---------------------NON-MEMBER FUNCTIONS------------------- */

/* Add one entry, flagged false, to kg_covered for every two-byte string
 * whose bytes each range over [0, ALPHABET_SIZE).
 *
 * Keys already present in kg_covered keep their current value, because
 * map::insert never overwrites.  Each loop counter is narrowed to
 * unsigned char before being stored in the key. */
void init_kgram_covered(map<string, bool>& kg_covered) {
  string dg(2, '\0');   /* reused two-byte key buffer */

  for (unsigned int first = 0; first < ALPHABET_SIZE; first++) {
    dg[0] = static_cast<char>(static_cast<unsigned char>(first));
    for (unsigned int second = 0; second < ALPHABET_SIZE; second++) {
      dg[1] = static_cast<char>(static_cast<unsigned char>(second));
      /* use the map's own value_type so the mapped value really is a bool
       * (the former pair<string, unsigned int> relied on a converting copy) */
      kg_covered.insert(map<string, bool>::value_type(dg, false));
    }
  }
}

/* Add one entry, flagged false, to cl_covered for every two-byte string
 * whose bytes each range over [0, ALPHABET_SIZE).
 *
 * Keys already present in cl_covered keep their current value, because
 * map::insert never overwrites.  Each loop counter is narrowed to
 * unsigned char before being stored in the key. */
void init_class_covered(map<string, bool>& cl_covered) {
  string dg(2, '\0');   /* reused two-byte key buffer */

  for (unsigned int first = 0; first < ALPHABET_SIZE; first++) {
    dg[0] = static_cast<char>(static_cast<unsigned char>(first));
    for (unsigned int second = 0; second < ALPHABET_SIZE; second++) {
      dg[1] = static_cast<char>(static_cast<unsigned char>(second));
      /* use the map's own value_type so the mapped value really is a bool
       * (the former pair<string, unsigned int> relied on a converting copy) */
      cl_covered.insert(map<string, bool>::value_type(dg, false));
    }
  }
}

/* Add an entry mapped to 0 to rm for every two-byte string whose bytes each
 * range over [0, ALPHABET_SIZE).  Keys already present in rm are left as
 * they are (map::insert does not overwrite). */
void init_remap(map<string, unsigned int>& rm) {
  string dg(2, '\0');   /* reused two-byte key buffer */

  for (unsigned int first = 0; first < ALPHABET_SIZE; first++) {
    dg[0] = static_cast<char>(static_cast<unsigned char>(first));
    for (unsigned int second = 0; second < ALPHABET_SIZE; second++) {
      dg[1] = static_cast<char>(static_cast<unsigned char>(second));
      rm.insert(map<string, unsigned int>::value_type(dg, 0));
    }
  }
}

void kgram_nfa::load_kgram_mapping(char *f) {
  ifstream in_file;
  int i, alpha_size;
  unsigned int tmp_char0, tmp_char1, tmp_new_alpha;
  string cur_kgram;

  in_file.open(f);
  if (!in_file) {
    cout << "Failed to open file " << f << endl;
    exit(0);
  }

  in_file >> alpha_size;
  for (i = 0; i < alpha_size; i++) {
    in_file >> tmp_char0 >> tmp_char1 >> tmp_new_alpha;
#ifdef TRACE_ALPHA_LOADING
  cout << tmp_char0 << " " << tmp_char1 << " " << tmp_new_alpha << endl;
#endif
    cur_kgram.append(1, (unsigned char)tmp_char0); 
    cur_kgram.append(1, (unsigned char)tmp_char1);
    class_c.insert(pair<string, unsigned int>(cur_kgram, tmp_new_alpha));
    cur_kgram.clear();
  }
  in_file.close();
}
