/*---------------------------------------------------------------------------
 * File:	nfa-multimap.h
 *
 * Author:	Liu Yang
 * Date:	Jun 14, 2009
 *
 *-------------------------------------------------------------------------
 * $Log: nfa-multimap.h,v $
 * Revision 1.19  2010/09/27 14:21:07  lyangru
 * final synchronization
 *
 * Revision 1.1  2009/09/02 01:57:00  lyangru
 * startup
 *
 * Revision 1.6  2009/06/22 13:35:13  lyangru
 * Added start to multimap_nfa and modified the corresponding functions
 * */

#include <sys/time.h>
#include <sys/resource.h>

#include <cstddef>
#include <iostream>
#include <map>
#include <set>
#include <utility>
#include <vector>


/* Size constants.
 * NOTE(review): within this header only ALPHABET_SIZE is referenced (it is
 * the length of nfa_map_t::class_c).  The meanings given for the other
 * macros are inferred from their names -- confirm against the
 * implementation file. */
/* presumably the <state, symbol, next state> columns of a transition entry */
#define NUM_COLUMN	3
/* presumably the length of a line buffer */
#define LINE_LEN	256
/* presumably the maximum length of a regex */
#define MAX_RE_LEN	4096
/* presumably the maximum length of a test string */
#define MAX_STRING_LEN	256
/* length of the per-symbol array nfa_map_t::class_c */
#define ALPHABET_SIZE	256


/* NOTE(review): a using-directive at file scope in a header is visible in
 * every file that includes the header.  It is left in place because code
 * that includes this header may rely on it. */
using namespace std;

/*--------------- class multimap_nfa ---------------------------------
 * An NFA, possibly with epsilon transitions, whose transition table holds
 * one multimap per state.  Only declarations appear here; the member
 * functions are defined elsewhere.
 *
 * Standard-library names are written with an explicit std:: prefix so the
 * class does not depend on a using-directive being in effect.
 * */
class multimap_nfa {

  public:
    int num_states;
    int num_accept_ids;	/* number of accept ids (they indicate signatures) */
    int num_transitions;
    int num_starts;	/* number of states in starts */
    bool eps_eliminated; /* whether epsilon transitions have been eliminated */
    std::set<int> starts;
    /* the actual start states (the NFA is combined from multiple NFAs)
     * together with the added start states */
    std::set<int> all_starts;
    /* start states of the individual NFAs that were combined into the
     * bigger NFA */
    std::vector<int> real_starts;
    /* start states added during the do_alternation_acc() combination */
    std::vector<int> added_starts;
    /* accepting states of the individual NFAs */
    std::vector<int> accepting_states;
    /* accepting states added during NFA combination */
    std::vector<int> added_acc_sts;
    /* transition table: the index of an element is the state number and
     * the element (a multimap) holds the transitions leaving that state */
    std::vector< std::multimap<int, int> > transitions;
    std::multimap<int, int> accept_ids;	/* for signature ids */
    /* maps real starts to added starts; for each entry there is an
     * epsilon transition between the key and the value */
    std::multimap<int, int> eps_starts_map;
    /* the range of state numbers of each individual NFA */
    std::multimap<int, int> local_states_num_ranges;
    std::set<int> frontiers;

/*--------------- multimap_nfa :: default constructor -------------
 * Initialize all member variables */
    multimap_nfa();

/*--------------- multimap_nfa :: constructor ---------------
 * Create an NFA from the given parameters
 * Pre	n_states is the number of states
 * 	s_start is the start state
 * 	a_states is the set of accepting states (passed by value)
 * 	trans is the transition table with entries of
 * 	  <state, symbol, next state>
 * 	n_transitions is the number of transitions
 * Post	multimap_nfa is created
 * */
/* unused now */
    multimap_nfa(int n_states, int s_start, std::vector<int> a_states, int **trans, int n_transitions);

/*--------------- multimap_nfa :: constructor ---------------
 * Create an NFA from a given text file
 * Pre	nfile names a text file with the expected format
 * 	eps_el indicates whether epsilon transitions have been eliminated
 * Post	multimap_nfa is created
 * */
    multimap_nfa(char *nfile, bool eps_el);

/*---------------multimap_nfa::find_reachable_starts-----------
 * Find the starts reachable by epsilon transition from a start state st
 * Pre	st is a start state
 * 	rs is empty
 * Post	epsilon-reachable states are added into rs */
    void find_reachable_starts(int st, std::set<int> &rs);

/*---------------multimap_nfa::delta -----------------------
 * Simulate the delta function of the NFA
 * Pre	state is the current state number
 * 	symbol is an input symbol
 * 	s_next is an empty set
 * Post	the set of next states has been filled into s_next
 * */
    void delta(int state, int symbol, std::set<int>& s_next);

/*---------------multimap_nfa::display_transitions----------
 * Display the transition table of an NFA
 * Pre	an NFA object is created
 * Post	the transition table is displayed
 * */
    void display_transitions();

/*-------------multimap_nfa::combine----------------------
 * Combine the current NFA with another NFA x
 * Pre	an NFA object is created
 * 	x is another NFA
 * Post	the current NFA is combined with x using the union operation
 * */
/* UNUSED NOW */
    void combine(multimap_nfa *x);

/*-------------multimap_nfa::out_to_file-----------------
 * Output an NFA to a text file
 * Pre	an NFA object is created and f is a given file name
 * Post	the NFA object has been written to the file
 * */
    void out_to_file(char *f);
    /* same, but adds the offset st_offset to all state numbers when
     * writing the NFA */
    void out_to_file(char *f, int st_offset);

/*-------------multimap_nfa::accept_or_not--------------
 * Check whether a string is accepted by this NFA
 * Pre	str is the string to be tested
 * Post	return true if str is accepted by the NFA,
 *  	  otherwise return false
 * */
    bool accept_or_not(const char *str);

    /* NOTE(review): no contract is documented for this variant; from the
     * name it presumably treats str as a stream -- confirm against the
     * definition. */
    bool accept_or_not_stream(const char *str);

/*------------multimap_nfa::is_accepting_state----------
 * Check whether a set contains one of the accepting states
 * Pre	s is a set of states (passed by value)
 * Post	return true if s contains one of the accepting states,
 *    	otherwise return false */
/*---UNUSED NOW since it provides very limited info---*/
    bool is_accepting_state(std::set<int> s);

/*--------------multimap_nfa::is_accepting-------------
 * Only check accept_ids and see if its intersection with s is nonempty
 * Pre	s is a set of states (passed by value)
 * Post	return true if s contains one of the accepting states;
 * 	the signature id being matched is output */
    bool is_accepting(std::set<int> s);

/*--------------multimap_nfa::epsilon_elimination-------
 * Eliminate the epsilon transitions of an NFA
 * Pre	NFA with epsilon transitions
 * Post	epsilon transitions have been removed */
    /* has the lowest performance among the three */
    void epsilon_elimination();
    /* slightly better than epsilon_elimination() */
    void epsilon_elimination_improved();
    /* epsilon_elimination_im2 is VOID NOW!!!!*/
    void epsilon_elimination_im2();
    /* epsilon_elimination_im3 has the best performance;
     * it corrects a bug in epsilon_elimination_im2 */
    void epsilon_elimination_im3();

/*------------functions used in epsilon elimination ----------
 * */
    /* ---multimap_nfa::is_start-------
     * checks whether a state is a member of the all_starts states
     * Pre 	s is a state number
     * Post	returns true if s belongs to all_starts,
     * 		otherwise returns false */
    bool is_start(int s);
    /* NOTE(review): the original comment read "checks whether state s
     * belongs to real_starts", while the name suggests a test against the
     * accepting states -- confirm against the definition. */
    bool is_real_accept(int s);
    /* find a state which has an epsilon transition to state s in case
     * that s belongs to all_starts */
    int get_eps_to_start(int s);
    /* do a local search (only over the state numbers belonging to
     * an individual NFA) for states which have transitions to state s
     * Pre	s is a state number
     * 		states_sym is empty
     * Post	<sta, sym> pairs are copied to states_sym, where the NFA
     * 		goes from state sta to state s with input sym */
    void local_search_eps_from(int s, std::multimap<int, int>& states_sym);
    /* gets the local range of state numbers for s
     * Pre	s is a state number
     * Post	start and end have been assigned values */
    void get_local_range(const int s, int& start, int& end);
    /* checks whether s is an added accepting state */
    bool is_added_acc_state(int s);
    /* obtains the set of states which are reachable from s through
     * epsilon transitions */
    void get_local_eps_reachable(int s, std::set<int>& eps_r);
    /* checks whether s is a start state added during NFA combination */
    bool is_added_start_state(int s);
    /* NOTE(review): undocumented; from the name it presumably returns the
     * added start state closest to st -- confirm against the definition. */
    int get_closest_added_start(int st);

};


/*--------------Non-member functions---------------*/

/*--------------display_map-------------------------
 * Display the key/value pairs of a multimap
 * Pre	p is a multimap (passed by value, so the argument is copied)
 * Post	the key/value pairs are displayed
 * */
void display_map(std::multimap<int, int> p);

/*--------------offset_state-----------------------
 * Shift the state numbers of a multimap by offset
 * Pre	p is a multimap and offset is the amount to be added to the
 *   	  second fields of p (the original note called it an unsigned
 *   	  integer, but the parameter is declared as int)
 * Post	the values of the second fields of p are increased by offset
 * */
void offset_state(std::multimap<int, int>& p, int offset);

/*-------------regex_combination------------------
 * Combine multiple regexes into one using the union operation
 * Pre	refile is the name of a file which contains multiple regexes;
 * 	  each regex occupies one line in the file
 * 	obj_regex is the destination for the result
 * 	  (NOTE(review): the required capacity is not stated here;
 * 	  presumably MAX_RE_LEN -- confirm against the definition)
 * Post	all regexes in refile are combined into one regex, i.e., obj_regex
 * Problem: each regex should be enclosed in a pair of parentheses,
 * 	otherwise the combined regex is not equivalent to the union of
 * 	all regexes */
 /* unused now */
void regex_combination(const char *refile, char *obj_regex);

 /* unused now
  * NOTE(review): no contract is documented here; from the names this
  * presumably writes a copy of src_str without slash characters into
  * dst_str -- confirm against the definition. */
void remove_slash(char *dst_str, const char *src_str);

 /* unused now
  * NOTE(review): no contract is documented here; from the name this
  * presumably adds a '|' to the string s in place -- confirm against the
  * definition. */
void add_bar(char *s);

/*-------------generate_test_strings--------------
 * Generate strings to be used for NFA/DFA/BDD match testing
 * Pre	src - a base string
 * 	alphabet - the characters which can be used to generate strings
 * 	pos_to_insert - the position at which characters are to be inserted
 * 	num_str - the number of strings to be generated
 * 	outfile - the output file (NOTE(review): presumably a file name --
 * 	  confirm against the definition)
 * Post	the generated strings are written to outfile */
void generate_test_strings(const char *src, const char *alphabet, size_t pos_to_insert, size_t num_str, char *outfile);

/*--------------rand_string------------------------
 * Generate a random string of a given length from a given alphabet
 * Pre	alphabet - the characters to be used
 * 	len - the size of the string to be generated
 * 	obj_str - destination buffer (NOTE(review): whether it must hold
 * 	  len + 1 bytes for a terminator is not stated here -- confirm
 * 	  against the definition)
 * Post	obj_str - the generated string */
void rand_string(const char *alphabet, const size_t len, char *obj_str);

/*-------------cputime()---------------------------
 * Report the current time in units of 1/1000 second (milliseconds).
 * NOTE(review): which clock is read (process CPU time or wall-clock time)
 * is not visible from this declaration -- confirm against the definition. */
int cputime();

/*------------is_member-------------
 * Check whether mm has an item whose key and value equal k and val
 * Pre	mm is a multimap object (passed by value, so the argument is copied)
 * 	k is compared with the key
 * 	val is compared with the value
 * Post	return true if such an entry is found,
 * 	otherwise return false */
bool is_member(std::multimap<int, int> mm, int k, int val);

/*-------------dfa_trans_t: transition table for a DFA----------------
 * This structure exists for DFA simulation.  According to the original
 * author, avoiding STL container operations makes it much faster than the
 * STL-based method, i.e., function accept_or_not_stream().
 * */
struct dfa_trans_t {

  /* ---- data ---- */
  unsigned int num_states;
  unsigned int start;
  /* trans_tab is a matrix:
   *   row    - state number
   *   column - ASCII code of the symbol
   * an element n at (r, s) means that the machine moves from state r
   * to state n on input symbol s */
  unsigned int **trans_tab;
  /* keeps the signature ids */
  int ** acc;

  /* ---- operations ---- */
  /* fill out the transition table
   * Pre	nfa is an object of multimap_nfa
   * Post	trans_tab has been filled out */
  bool fill_trans_tab(multimap_nfa& nfa);
  /* NOTE(review): overload taking f_dfa, presumably the name of a file
   * describing the DFA -- confirm against the definition. */
  void fill_trans_tab(char *f_dfa);
  /* perform the DFA simulation
   * Pre	buf is a stream of input symbols
   * 		len is the length of the stream
   * Post	the matched signature ids and their offsets in the stream
   * 		are reported */
  void simulate(const unsigned char *buf, unsigned int len) const;

};

/*---------------------DIFFERENT IMPLEMENTATIONS OF NFA------------------
 * - All of the implementations below assume that epsilon transitions 
 * have been eliminated in the transition table.
 * - Class multimap_nfa deals with NFA with epsilon transitions 
 * - The four different implementations consume different amounts of memory
 *   and provide different execution performance
 * --------------------------------------------------------------------*/
/* Data handed to one simulation thread.  The pointers are plain pointers;
 * nothing in this declaration says who owns what they point to. */
struct simul_thread_data {
  int b_first;	// set to 1 if the data is for the first thread
  unsigned char *payload;	// input bytes to be processed
  int len;	// NOTE(review): presumably the length of payload -- confirm
  std::set<unsigned int> *starts;	// set of start states
};

/* ----- nfa_trans_set_t -----
 * NFA whose transition tables hold sets of next states.  Two tables are
 * declared: trans_tab (used by simulate()) and ptrans_tab (used by
 * simulatep()). */
struct nfa_trans_set_t {
  /* fills out the transition table trans_tab from a multimap_nfa object */
  bool fill_trans_tab(multimap_nfa& nfa);
  /* NOTE(review): overload taking f_nfa, presumably the name of a file
   * describing the NFA -- confirm against the definition. */
  void fill_trans_tab(char *f_nfa);
  /* performs the NFA simulation using the transition table trans_tab,
   * which is more memory-consuming than ptrans_tab */
  void simulate(const unsigned char *buf, unsigned int len) const;

  /* fills out ptrans_tab */
  void fill_ptrans_tab(char *f_nfa);
  /* per the original author, simulatep() is more efficient because null
   * pointers stand for empty sets; it consumes less memory than
   * simulate() */
  void simulatep(const unsigned char *buf, unsigned int len) const;
  /* for multi-thread simulation */
  void *mt_simulate(simul_thread_data *payload_and_starts);

  unsigned int num_states;
  std::set<unsigned int> starts;
  std::set<unsigned int> **trans_tab;
  std::set<unsigned int> ***ptrans_tab;
  /* keeps the signature ids */
  int **acc_ids;

};

/* nfa_trans_mm_t uses a multimap to represent the transition table.
 * The original note says it consumes less memory than "nfa_trans_t".
 * NOTE(review): no type of that name is declared in this header; the
 * comparison is presumably with nfa_trans_set_t -- confirm. */
struct nfa_trans_mm_t {

  unsigned int num_states;
  std::set<unsigned int> starts;
  /* NOTE(review): presumably one multimap (symbol -> next state) per
   * state, indexed by state number -- confirm against fill_trans_tab(). */
  std::multimap<unsigned char, unsigned int> *trans_tab;
  /* keeps the signature ids */
  unsigned int **acc_ids;

  /* fills s_next with the states reached from state on input sym
   * (NOTE(review): contract inferred from multimap_nfa::delta -- confirm) */
  void delta(unsigned int state, unsigned char sym, std::set<unsigned int>& s_next);
  void fill_trans_tab(char *f_nfa);
  void simulate(const unsigned char *buf, unsigned int len);

};

/* nfa_map_t uses the map method: each state and its transitions are
 * denoted by a map (sym, set_of_next_states).  It also carries the data
 * and functions for alphabet reduction. */
struct nfa_map_t {

  unsigned int num_states;
  unsigned int num_transitions;
  /* NOTE(review): presumably the number of transitions in the reduced
   * table ("rdt") -- confirm against the definition. */
  unsigned int num_rdt_trans;
  std::set<unsigned int> starts;
  std::set<unsigned int> accepting_states;
  std::multimap<unsigned int, unsigned int> accept_ids;
  std::map<unsigned char, std::set<unsigned int> > *trans_tab;
  /* keeps the signature ids */
  unsigned int **acc_ids;

  /* for alphabet reduction */
  std::set< std::set<unsigned int> > target_sets;
  /* one entry per symbol; NOTE(review): presumably the equivalence class
   * assigned to each symbol -- confirm against alphabet_reduction(). */
  unsigned char class_c[ALPHABET_SIZE];
  std::map<unsigned char, std::set<unsigned int> > *reduced_trans_tab;


  void fill_trans_tab(char *f_trans);
  void simulate(const unsigned char *buf, unsigned int len);

  /*-------------functions for alphabet reduction*/
  void alphabet_reduction();
  void alphabet_reduction_im();
  /* generates the transition table with the reduced alphabet */
  void rdt_trans_tab_gen();
  void dump_rdt_transitions(char *f);
  void rdt_simulate(const unsigned char *buf, unsigned int len);

};

/* Record to be used in multibyte match.
 * NOTE(review): the field meanings below are inferred from the names --
 * confirm against the code that fills them in. */
struct validation_info {
  unsigned int acc_st;	/* presumably an accepting state */
  unsigned int src_st;	/* presumably the state the match started from */
  unsigned int offset;	/* presumably an offset into the input */
};

/* Thread data like simul_thread_data, except that there is no b_first
 * flag and the starts are a set of pairs. */
struct simul_thread_data2 {
  unsigned char *payload;	// input bytes to be processed
  int len;	// NOTE(review): presumably the length of payload -- confirm
  // an earlier version used a multimap here:
  //multimap<unsigned int, unsigned int> *starts;
  std::set<std::pair<unsigned int, unsigned int> > *starts;
};
