#!/usr/bin/env python3
"""Generate, sanitize and parse surveys."""
import csv
import os
import re
import tempfile

import pandas

import utils

BLANKS = ["", "Response", "Open-Ended Response"]

# TODO Don't crash when user inputs wrong things

def get_survey(name):
	"""Gets survey dataframe from .csv while dropping hidden columns."""
	base = os.path.join("db", "surveys", name)
	survey = pandas.read_csv(os.path.join(base, "survey.csv"), index_col=False)
	survey = survey.fillna("")
	properties = utils.safejsonload(os.path.join(base, "properties.json"))

	return survey, properties

def get_metadata(name):
	"""Load the parsed-survey metadata from db/surveys/<name>/metadata.json."""
	path = os.path.join("db", "surveys", name, "metadata.json")
	metadata = utils.safejsonload(path)
	return metadata

def get_readings(name):
	"""Load the saved readings from db/surveys/<name>/readings.json."""
	path = os.path.join("db", "surveys", name, "readings.json")
	readings = utils.safejsonload(path)
	return readings

def filter_survey(survey, properties, metadata, selected_answers):
	"""Filter survey given desired answers for each question."""
	# Filter survey by only matching rows with with valid answer for each question
	# When checking a column touchpoint, select the rows where the answer in question
	# contains every selected answer
	for column in selected_answers:
		if column in metadata["touchpoint-columns"]:
			all_answers_regex = "(?=.*" + ")(?=.*".join(selected_answers[column]) + ")"
			survey = survey.loc[survey[column].str.contains(all_answers_regex)]
		else:
			survey = survey.loc[survey[column].isin(selected_answers[column])]

	# Drop the entire dataframe if there are too few matches (the "anon-matches" threshold)
	if len(survey.index) < properties["anon-matches"]:
		survey = survey.drop(survey.index)

	return survey
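
# A hedged sketch of the lookahead matching used by filter_survey() above; the
# column values and selected answers are invented for illustration only.
def _example_touchpoint_match():
	"""Return a boolean mask marking rows that contain every selected answer."""
	column = pandas.Series(["Email;Phone", "Email", "Phone;Email;Chat"])
	selected = ["Email", "Phone"]
	pattern = "(?=.*" + ")(?=.*".join(re.escape(answer) for answer in selected) + ")"
	return column.str.contains(pattern)  # [True, False, True]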

def sanitize(string):
	"""Collapse repeated spaces and strip leading/trailing whitespace."""
	string = re.sub(" +", " ", string)
	string = string.strip()
	return string

def join_not_null(x):
	"""Join the non-null, non-empty values of a row into a ";"-separated string."""
	return ";".join(filter(None, x[x.notnull()].astype(str)))

def parse_file(source, touchpoints=None, deleted=None):
	"""Parse the raw survey export into a pandas dataframe plus its metadata."""
	touchpoints = touchpoints or []
	deleted = deleted or []

	# Remove deleted columns from the touchpoints
	touchpoints = [c for c in touchpoints if c not in deleted]

	survey = []
	with open(source, "r", newline="", encoding="utf-8") as raw_file:
		raw = csv.reader(raw_file, delimiter=",", quotechar='"')
		survey = list(raw)

	# HACK This whole for maybe should just be pandas...
	categories = {}
	for column_i in range(len(survey[0])):
		survey[0][column_i] = sanitize(survey[0][column_i]) # Category
		survey[1][column_i] = sanitize(survey[1][column_i]) # Question

		prev_category = None if column_i == 0 else survey[0][column_i - 1]

		# Fill empty category cells from the previous column's category
		if survey[0][column_i] == "":
			survey[0][column_i] = prev_category

		# Replace empty questions with their category
		if survey[1][column_i] in BLANKS:
			survey[1][column_i] = survey[0][column_i]

		# Populate categories dictionary
		if survey[0][column_i] == prev_category:
			categories[survey[0][column_i]].append(survey[1][column_i])
		else:
			categories[survey[0][column_i]] = [survey[1][column_i]]

		# Touchpoint questions, which the raw export splits into one column per
		# possible touchpoint, are renamed to their category
		if survey[0][column_i] in touchpoints:
			survey[1][column_i] = survey[0][column_i]
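
	# Illustrative trace of the loop above, with invented column names
	# (touchpoints = ["Channels used"]):
	#   categories row in:  ["Respondent",    "Channels used", "",              "Satisfaction"]
	#   questions row in:   ["Respondent ID", "Email",         "Phone",         "Response"]
	#   categories row out: ["Respondent",    "Channels used", "Channels used", "Satisfaction"]
	#   questions row out:  ["Respondent ID", "Channels used", "Channels used", "Satisfaction"]
	#   categories dict:    {"Respondent": ["Respondent ID"],
	#                        "Channels used": ["Email", "Phone"],
	#                        "Satisfaction": ["Satisfaction"]}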

	survey = survey[1:]  # Drop the categories row

	# FIXME Remove empty rows, judged by ratio of blank to non-blank cells
	#for row in reversed(survey):
		#choice_questions = len(categories[choices[0]]) if choices else 0
		#if (row.count("") - choice_questions) / len(row) > blank_row_ratio:
			#survey.remove(row)

	# Use the questions row as the header and the remaining rows as data
	headers = survey.pop(0)
	survey_df = pandas.DataFrame(survey, columns=headers)

	# Drop columns that are entirely empty
	survey_df = survey_df.dropna(how="all", axis=1)

	# Drop columns marked as deleted
	survey_df = survey_df.drop(deleted, errors="ignore", axis=1)
	for column in deleted:
		categories.pop(column, None)

	# "Touchpoint" type questions are split into as many columns as possible answers
	# Merge all questions from "touchpoints" categories into single columns
	# Comma-separate answers and label the new merged column as category name
	for category in touchpoints:
		merged = survey_df[category].groupby(level=0, axis=1).apply(lambda x: x.apply(join_not_null, axis=1))
		survey_df = survey_df.drop([category], axis=1)
		survey_df[category] = merged
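
	# Illustrative result (invented names): two columns both labelled "Channels used",
	# holding "Email" and "Phone", collapse into one column containing "Email;Phone".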

	# Get unique answers for each question
	answers = {}
	for i, column in enumerate(survey_df):
		if column not in touchpoints:
			answers[column] = survey_df.iloc[:, i].unique().tolist()

	# Get the possible answers for touchpoint questions, which were skipped in the loop above
	for touchpoint in touchpoints:
		answers[touchpoint] = categories[touchpoint]
		categories[touchpoint] = [touchpoint]

	metadata = {
		"categories": categories,
		"answers": answers,
		"touchpoint-columns": touchpoints,
	}

	return survey_df, metadata

def write(survey, destination, metadata):
	"""Writes parsed surveys to destination."""
	os.makedirs(destination, exist_ok=True)

	survey.to_csv(os.path.join(destination, "survey.csv"), index=False, mode="w+")

	path = os.path.join(destination, "metadata.json")
	utils.safejsondump(metadata, path)

	return 0
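
# Hedged end-to-end sketch: build a tiny raw export in a temporary directory and
# run it through parse_file() and write(). The column names, categories and
# answers are invented; real exports will differ.
def _demo_round_trip():
	rows = [
		["Respondent", "Channels used", "", "Satisfaction"],   # categories row
		["Respondent ID", "Email", "Phone", "Response"],       # questions row
		["1", "Email", "Phone", "High"],
		["2", "Email", "", "Low"],
	]
	with tempfile.TemporaryDirectory() as tmp:
		source = os.path.join(tmp, "raw.csv")
		with open(source, "w", newline="", encoding="utf-8") as raw_file:
			csv.writer(raw_file).writerows(rows)

		survey_df, metadata = parse_file(source, touchpoints=["Channels used"])
		write(survey_df, os.path.join(tmp, "parsed"), metadata)

	return survey_df, metadata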
