from __future__ import division
import sys
import fractions
import math

# Function: 
# input: a pair of (BB, BB)
# output: index in the array node_list (node number of the variable in G) corresponding to the pair (BB, BB)

def getNodeNum(bb1, bb2):
    """Return the 1-based node number of the pair (bb1, bb2).

    Scans the module-level node_list in order; both elements of each stored
    pair are converted with int() before being compared to bb1 and bb2.

    Returns the position of the first matching pair plus one (list indices
    start at 0, while node numbering on the MATLAB side starts at 1), or -1
    when no pair matches.
    """
    for position, pair in enumerate(node_list):
        if int(pair[0]) == bb1 and int(pair[1]) == bb2:
            return position + 1
    return -1

# Function to compute how "similar" the two counts are.

def computeSimilarity(c1, c2):
    """Return the ratio smaller/larger of two counts, rounded to 2 decimals.

    c1, c2: numeric counts (ints, floats or Fractions).
    Returns a float; equal counts give 1.0.

    Two zero counts are identical, so they yield 1.0 instead of raising
    ZeroDivisionError as the unguarded division would.
    """
    if c1 < c2:
        smaller, larger = c1, c2
    else:
        smaller, larger = c2, c1

    # 0/0 guard: only the "both counts are zero" case is special-cased.
    if smaller == 0 and larger == 0:
        return 1.0

    # Relies on true division (the file enables it via __future__).
    return round(float(smaller / larger), 2)

def computeDifferenceAbsCounts(c1, c2):
    """Return the absolute difference of two counts, truncated to an int."""
    gap = c1 - c2
    return int(math.fabs(gap))

def computeDifferenceDigramCounts(c1, c2):
    """Return the absolute difference of two digram counts as a float."""
    return math.fabs(c1 - c2)

# Given : relative count of a j2me BB, relative count of an android BB
#         (callers pass the count itself, e.g. entry[1], not the whole list entry)
# Output : true if the two relative counts are similar (similarity > 0.6)
#	   false otherwise
# This function compares relative counts.

def areSimilarlyHigh(j2me_entry, android_entry):
    """Return True when two relative counts are similar.

    j2me_entry, android_entry: the counts to compare; they are handed
    straight to computeSimilarity(), so despite the parameter names these
    are count values, not list entries.

    The counts are considered similar when their similarity ratio is
    strictly greater than 0.6.
    """
    return computeSimilarity(j2me_entry, android_entry) > 0.6

# This function compares absolute counts. Figured out that when absolute counts are low, you should be comparing absolute counts, rather than 
# relative counts. For ex. if rel_count1 = 5/1296, rel_count2 = 9/876, if you take rel_count1/rel_count2, this is more of a comparison between
# the lengths of the traces, rather than between the counts. So, what you should be doing is 5/9. But 5/9 does not correctly represent how similar the 
# counts are. Rather, if you take a difference, (9-5), then it is a better measure. But what if tomorrow's traces have different low counts? 

def areSimilarlyLow(j2me_entry, android_entry):
    """Return True when two low-count entries have similar absolute counts.

    j2me_entry, android_entry: list entries whose absolute count is stored
    at index 2.

    The entries are similar when the absolute counts differ by at most 10,
    or when their similarity ratio is at least 0.5. The ratio is only
    computed when the difference test fails.
    """
    jAbs = j2me_entry[2]
    aAbs = android_entry[2]
    closeEnough = computeDifferenceAbsCounts(jAbs, aAbs) <= 10
    return closeEnough or computeSimilarity(jAbs, aAbs) >= 0.5


# Given : entry from the list
# Output : true if (BB count) is high
#	   false otherwise	  

def isHigh(entry, relcount_threshold=0.1):
    """Return True when the entry's relative count is above the threshold.

    entry: list entry whose relative count is stored at index 1; the value
        is converted with float() before the comparison.
    relcount_threshold: counts strictly greater than this are "high".
        Defaults to 0.1, the value that used to be hard-coded.
    """
    return float(entry[1]) > relcount_threshold

# Given : trace file path, A, B
# Output: projected trace containing only the lines equal to A or B (one per line).

def projectedTrace(traceFile, A, B):
    """Project a trace file onto the two basic blocks A and B.

    traceFile: path of a text file that is read line by line.
    A, B: values compared, via str(), against each line with its trailing
        newline removed.

    Returns a string holding every line equal to str(A) or str(B), in file
    order, each followed by a newline; an empty string when nothing matches.
    """
    wanted = (str(A), str(B))
    kept = []
    # The with-block closes the file even if reading fails part-way.
    with open(traceFile, 'r') as trace:
        for line in trace:
            line = line.rstrip('\n')
            if line in wanted:
                kept.append(line + "\n")
    return "".join(kept)
  
# Given : trace file path, A, B
# Output : projected trace in which lines equal to A or B are kept and every other line is replaced by E (Epsilon), written as "-1".

def projectedTraceWithEpsilon(traceFile, A, B):
    """Project a trace file onto A and B, marking all other lines as epsilon.

    traceFile: path of a text file that is read line by line.
    A, B: values compared, via str(), against each line with its trailing
        newline removed.

    Returns a string with one output line per input line: the line itself
    when it equals str(A) or str(B), otherwise "-1" (the epsilon marker).
    Every output line is followed by a newline.
    """
    wanted = (str(A), str(B))
    projected = []
    # The with-block closes the file even if reading fails part-way.
    with open(traceFile, 'r') as trace:
        for line in trace:
            line = line.rstrip('\n')
            if line in wanted:
                projected.append(line + "\n")
            else:
                projected.append("-1" + "\n")
    return "".join(projected)


# Given : Trace, pattern
# Output : Count of how many times the given pattern appears in a sliding window in the trace

def computePatternCount(trace, pattern):
    """Count (possibly overlapping) occurrences of pattern in trace.

    trace: newline-separated sequence of entries, as one string.
    pattern: newline-separated entries to look for, as one string.

    An occurrence only counts when it is aligned to whole lines: it must
    start at the beginning of a line and stop at the end of a line. A plain
    substring search would, for example, find the pattern 1,11 inside the
    trace 11,11 because "1" is a suffix of "11".

    Returns the number of aligned occurrences found by sliding the search
    start one character at a time.
    """
    # An empty pattern matches at every offset, so the count is the length.
    if not pattern:
        return len(trace)

    traceLen = len(trace)
    patternLen = len(pattern)
    # A pattern that itself begins/ends with a newline is already aligned
    # on that side.
    startsAligned = pattern.startswith("\n")
    endsAligned = pattern.endswith("\n")

    count = 0
    searchIndex = 0
    while searchIndex < traceLen:
        index = trace.find(pattern, searchIndex)
        if index == -1:
            break
        end = index + patternLen
        atLineStart = startsAligned or index == 0 or trace[index - 1] == "\n"
        atLineEnd = endsAligned or end == traceLen or trace[end] == "\n"
        if atLineStart and atLineEnd:
            count += 1
        # Advance by one character so overlapping occurrences are seen.
        searchIndex = index + 1
    return count

# Given : string1, string2
# Output : All possible twos combinations of (string1, string2).

def allTwosWithEpsilon(str1, str2):
    """Return the two-entry patterns built from str1 and str2.

    Each pattern is the two strings joined by a newline. Only the two
    orderings (str1, str2) and (str2, str1) are produced; the repeated
    pairs and the pairings with the epsilon marker "-1" are currently
    disabled.
    """
    forward = str1 + "\n" + str2
    backward = str2 + "\n" + str1
    return [forward, backward]

def allTwos(str1, str2):
    """Return the two-entry patterns built from str1 and str2.

    Each pattern is the two strings joined by a newline. Only the two
    orderings (str1, str2) and (str2, str1) are produced; the repeated
    pairs (str1, str1) and (str2, str2) are currently disabled.
    """
    return [first + "\n" + second
            for first, second in ((str1, str2), (str2, str1))]

# the input argument "score" is an array with three entries: 
# score[0] = # match. 
# score[1] = # (match since both entries were zero). 
# score[2] = # mismatch.

def scoreSimilarCounts(jentry, aentry, score):
    """Classify one pair of pattern counts and update the running score.

    jentry, aentry: sequences with the absolute pattern count at index 1
        and the relative pattern count at index 2.
    score: three-element list updated in place --
        score[0] counts matches,
        score[1] counts matches where both absolute counts are zero,
        score[2] counts mismatches.

    Returns "match 00", "match" or "mis-match" for the pair.
    """
    abs_jcount = jentry[1]
    abs_acount = aentry[1]

    rel_jcount = jentry[2]
    rel_acount = aentry[2]

    if abs_jcount == 0 and abs_acount == 0:
        score[1] += 1
        return "match 00"

    # Open design question kept from the original notes: for high relative
    # counts the intended comparison was computeSimilarity() > 0.6 on the
    # relative counts, and for low ones computeDifferenceAbsCounts() <= 10
    # on the absolute counts. The rule actually in force is simpler: the
    # relative counts must differ by less than 0.1.
    if computeDifferenceDigramCounts(rel_jcount, rel_acount) < 0.1:
        score[0] += 1
        return "match"

    score[2] += 1
    return "mis-match"

def computeScore(prjJ2meTrace, prjAndroidTrace, A, B, alpha, beta, epsilon, score):
    """Compare digram counts of (A, B) and (alpha, beta) in two projected traces.

    prjJ2meTrace, prjAndroidTrace: projected traces as newline-separated
        strings; their lengths are taken as the number of newlines.
    A, B: J2ME basic blocks; alpha, beta: Android basic blocks.
    epsilon: 1 to build patterns with allTwosWithEpsilon(), 0 to build them
        with allTwos(). Any other value prints an error and exits with
        status 1.
    score: three-element list updated in place by scoreSimilarCounts().

    For every pattern pair one line with the relative and absolute counts
    and the verdict is printed, followed by the final score.

    Returns the same score list that was passed in.
    """
    if epsilon == 1:
        jTwos = allTwosWithEpsilon(str(A), str(B))
        aTwos = allTwosWithEpsilon(str(alpha), str(beta))
    elif epsilon == 0:
        jTwos = allTwos(str(A), str(B))
        aTwos = allTwos(str(alpha), str(beta))
    else:
        print("Invalid epsilon value")
        sys.exit(1)

    jlen = prjJ2meTrace.count("\n")
    alen = prjAndroidTrace.count("\n")

    for jPattern, aPattern in zip(jTwos, aTwos):
        jc = computePatternCount(prjJ2meTrace, jPattern)
        ac = computePatternCount(prjAndroidTrace, aPattern)
        # An empty projected trace has no digrams: use a relative count of
        # 0.0 instead of dividing by zero.
        jrel = float(jc) / jlen if jlen else 0.0
        arel = float(ac) / alen if alen else 0.0
        verdict = scoreSimilarCounts([-1, jc, jrel], [-1, ac, arel], score)
        fields = (jPattern, "->", aPattern, jrel, "::", arel,
                  "(", jc, "/", jlen, "::", ac, "/", alen, ")", verdict)
        print(" ".join(str(field) for field in fields))

    print("Score:  " + str(score))
    return score

def writeMatCodeFiles(A, B, alpha, beta, fNumRef, cpd):
    """Emit one new factor connecting the nodes (A, alpha) and (B, beta).

    A, B, alpha, beta: basic-block numbers used to look up node numbers
        with getNodeNum().
    fNumRef: one-element list holding the current factor number; it is
        incremented in place and the new value names the factor.
    cpd: string with the CPT entries placed inside the tabular_CPD line.

    Every generated line is printed and appended to the matching output
    file: the CPD line to codeFileCPD, then one G(node, factor) line per
    node to codeFileNodes.
    """
    # Look the nodes up before bumping the factor number.
    graphNodes = [getNodeNum(A, alpha), getNodeNum(B, beta)]

    fNumRef[0] += 1
    factorNum = str(fNumRef[0])

    cpdLine = "\t\ttabular_CPD(bnet, fc78, 'CPT', [" + cpd + "]), ...\n"
    print(cpdLine)
    codeFileCPD.write(cpdLine)

    for node in graphNodes:
        edgeLine = "G(" + str(node) + "," + factorNum + ") = 1;\n"
        print(edgeLine)
        codeFileNodes.write(edgeLine)

def writeJointFactorMatCodeFiles(A, B, alpha, beta, fNumRef, cpd):
    """Emit one new joint factor over the four nodes formed from A, B, alpha, beta.

    The factor is connected to the nodes (A, alpha), (A, beta), (B, alpha)
    and (B, beta), in that order, as looked up with getNodeNum().

    fNumRef: one-element list holding the current factor number; it is
        incremented in place and the new value names the factor.
    cpd: string with the CPT entries placed inside the tabular_CPD line.

    Every generated line is printed and appended to the matching output
    file: the CPD line to codeFileCPD, then one G(node, factor) line per
    node to codeFileNodes.
    """
    # Look the nodes up before bumping the factor number.
    graphNodes = [
        getNodeNum(A, alpha),
        getNodeNum(A, beta),
        getNodeNum(B, alpha),
        getNodeNum(B, beta),
    ]

    fNumRef[0] += 1
    factorNum = str(fNumRef[0])

    cpdLine = "\t\ttabular_CPD(bnet, fj4578, 'CPT', [" + cpd + "]), ...\n"
    print(cpdLine)
    codeFileCPD.write(cpdLine)

    for node in graphNodes:
        edgeLine = "G(" + str(node) + "," + factorNum + ") = 1;\n"
        print(edgeLine)
        codeFileNodes.write(edgeLine)

# error checking
# The script needs exactly seven arguments (sys.argv[0] is the script name).
if len(sys.argv) != 8:
    print("# Usage: python digrams_factor_CPD_r.py <android list file> <j2me list file> <android tracefile in terms of BB numbers> <j2me tracefile in terms of BB numbers> <code_file_nodes> <code_file_cpd> <code_file_num> ")
    # Exit with a non-zero status so callers can detect the usage error.
    sys.exit(1)

# start
def _readCountList(listPath):
    """Parse one BB list file.

    Each non-blank line is split on whitespace and must hold at least three
    fields: BB number, relative count, absolute count.

    Returns (entries, bbTokens) where every entry is
    [BB number as int, relative count as Fraction, absolute count as int, 0]
    and bbTokens holds the unconverted first field of every line, in file
    order.
    """
    entries = []
    bbTokens = []
    with open(listPath, 'r') as listFile:
        for line in listFile:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            entries.append([int(values[0]), fractions.Fraction(values[1]), int(values[2]), 0])
            bbTokens.append(values[0])
    return entries, bbTokens


# androidList has format: BB number, relative count, absolute count, flag which when turned ON would indicate that BB is frequent
androidList, androidTokens = _readCountList(sys.argv[1])

# j2meList has the same format and reads from the j2me list
j2meList, j2meTokens = _readCountList(sys.argv[2])

# Local node list: one [j2me BB, android BB] pair (as the strings read from
# the files) for every combination, j2me-major. Built from the tokens
# already in memory instead of re-reading the android file per j2me line.
node_list = [[jToken, aToken] for jToken in j2meTokens for aToken in androidTokens]

# I changed the trueCPD to have less weightage than false CPD. But that didn't make much effect. It only reduced the perfect 1.0 values to something like 0.999455. 
# Somehow, almost all true values are very high, like 0.999***.

# initialize output list
# Placeholder; the pairing loop rebinds cpd to one of the strings below.
cpd = []

# CPT strings for the ordering factor. The "_h" (High High case) and "_l"
# (High Low case) variants currently carry the same values.
trueAndCPD_h = trueAndCPD_l = "0.4 0.4 0.4 0.6"
falseAndCPD_h = falseAndCPD_l = "0.6 0.6 0.6 0.4"

# NOTE(review): not referenced by the code visible in this file.
falseCPD_joint = "0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.99 0.01"

# Output file in which nodes are written: G(34, 56) = 1
nodesPath = sys.argv[5]
cpdPath = sys.argv[6]

# Both output files are opened in append mode ('a+').
codeFileNodes = open(nodesPath, 'a+')

# Output file in which tabular_CPDs for the "ordering factor" are written:
#                 tabular_CPD(bnet, fc78, 'CPT', [1 1 1 0]), ...
codeFileCPD = open(cpdPath, 'a+')

# Defaults used when the counter file does not define a value.
nfactors = 0
nvars = 0

# You need to know the current factor number. The file is expected to hold
# lines of the form "nfactors = 331;" and "nvars = 325;".
with open(sys.argv[7], 'r') as numFile:
    for line in numFile:
        values = line.split()
        # Skip blank or incomplete lines instead of failing on a missing field.
        if len(values) < 3 or values[1] != "=":
            continue
        if values[0] == "nfactors":
            nfactors = int(values[2].rstrip(';'))
        elif values[0] == "nvars":
            nvars = int(values[2].rstrip(';'))

# One-element list so the writer functions can increment the factor number in place.
factorNumRef = [nfactors]

def _isHighMatch(jEntry, aEntry):
    """Return True if both entries have a high relative count and those counts are similar."""
    return (isHigh(jEntry) and isHigh(aEntry)
            and areSimilarlyHigh(jEntry[1], aEntry[1]))


def _isLowMatch(jEntry, aEntry):
    """Return True if both entries have a low relative count and similar absolute counts."""
    return (not isHigh(jEntry) and not isHigh(aEntry)
            and areSimilarlyLow(jEntry, aEntry))


# score[0] = # match, score[1] = # match since both counts were zero,
# score[2] = # mismatch. Reset for every candidate pair of pairs.
score = [0, 0, 0]

for i, jFirst in enumerate(j2meList):
    A = jFirst[0]
    for j, aFirst in enumerate(androidList):
        alpha = aFirst[0]

        # The classification of (A, alpha) does not depend on (B, beta),
        # so it is evaluated once per pair.
        firstHigh = _isHighMatch(jFirst, aFirst)
        firstLow = _isLowMatch(jFirst, aFirst)
        if not (firstHigh or firstLow):
            # Neither case below can apply to this (A, alpha).
            continue

        # Starting x at i + 1 and skipping y == j means the second pair
        # always uses a different j2me entry and a different android entry.
        for x in range(i + 1, len(j2meList)):
            jSecond = j2meList[x]
            B = jSecond[0]
            for y, aSecond in enumerate(androidList):
                if y == j:
                    continue
                beta = aSecond[0]
                score[0] = score[1] = score[2] = 0

                secondHigh = _isHighMatch(jSecond, aSecond)

                # Case a: relcount(A) = high && relcount(alpha) = high && relcount(A) ~ relcount(alpha) &&
                #         relcount(B) = high && relcount(beta) = high && relcount(B) ~ relcount(beta)
                if firstHigh and secondHigh:
                    print("High High ( %s %s ) ( %s %s )" % (A, alpha, B, beta))
                    # The traces are projected without epsilon entries.
                    prjJ2meTrace = projectedTrace(sys.argv[4], A, B)
                    prjAndroidTrace = projectedTrace(sys.argv[3], alpha, beta)
                    score = computeScore(prjJ2meTrace, prjAndroidTrace, A, B, alpha, beta, 1, score)
                    # One mismatch is tolerated here: requiring none gives
                    # lots of false positives, because many true mappings
                    # have exactly one mismatch.
                    if score[2] in (0, 1) and score[0] >= score[1]:
                        cpd = trueAndCPD_h
                    else:
                        cpd = falseAndCPD_h
                    writeMatCodeFiles(A, B, alpha, beta, factorNumRef, cpd)

                # Case c: relcount(A) = low && relcount(alpha) = low && relcount(A) ~ relcount(alpha) &&
                #         relcount(B) = high && relcount(beta) = high && relcount(B) ~ relcount(beta) OR
                #         relcount(A) = high && relcount(alpha) = high && relcount(A) ~ relcount(alpha) &&
                #         relcount(B) = low && relcount(beta) = low && relcount(B) ~ relcount(beta)
                if (firstLow and secondHigh) or (firstHigh and _isLowMatch(jSecond, aSecond)):
                    print("High Low ( %s %s ) ( %s %s )" % (A, alpha, B, beta))
                    prjJ2meTrace = projectedTrace(sys.argv[4], A, B)
                    prjAndroidTrace = projectedTrace(sys.argv[3], alpha, beta)
                    score = computeScore(prjJ2meTrace, prjAndroidTrace, A, B, alpha, beta, 1, score)
                    # No mismatch is tolerated in the mixed case.
                    if score[2] == 0 and score[0] >= score[1]:
                        cpd = trueAndCPD_l
                    else:
                        cpd = falseAndCPD_l
                    writeMatCodeFiles(A, B, alpha, beta, factorNumRef, cpd)

# All factors have been emitted; release the two append-mode output files.
codeFileNodes.close()
codeFileCPD.close()

# Output file in which only two lines are written:
#               nfactors = 331;
#               nvars = 325;
# It is overwritten with the updated factor count and the unchanged
# variable count; each line is also echoed to stdout.
with open(sys.argv[7], 'w') as codeFileNum:
    for label, value in (("nfactors", factorNumRef[0]), ("nvars", nvars)):
        s = label + " = " + str(value) + ";\n"
        codeFileNum.write(s)
        print(s)


