import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.transforms import Join
from pyspark.sql.functions import col,year,month,dayofmonth,to_date,from_unixtime, when
import datetime
import boto3
import hashlib, uuid
import sys
from awsglue.utils import getResolvedOptions
from pyspark.sql import SparkSession
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
import json


# Module-level state read by the record-level helpers in this file.
# Each name is rebound by its setter/getter before the job processes data.
salted_string = None     # salt appended to a value before it is hashed
# Each entity list gets its OWN list object: a chained assignment
# (a = b = []) would alias both names to one list, so an in-place change
# to one would silently show up in the other.
masked_entities = []     # field names that receive a "<name>_masked" column
encrypted_entities = []  # field names that receive a "<name>_encrypted" column
my_region = None         # AWS region name used when creating service clients
def get_masked_entities():
    """
    Populate the module-level ``masked_entities`` list.

    The list holds the names of the structured fields that are to be masked.
    Nothing is returned; the result is published through the global.
    """
    global masked_entities
    name_fields = ["FirstName", "LastName"]
    address_fields = ["Address1", "Address2"]
    masked_entities = ["DOB", *name_fields, *address_fields]
    
def get_region_name():
    """
    Store the region name of a fresh boto3 session in the global ``my_region``.

    Nothing is returned; the value is published through the global.
    """
    global my_region
    my_region = boto3.session.Session().region_name

def get_encrypted_entities():
    """
    Populate the module-level ``encrypted_entities`` list.

    The list holds the names of the fields that are to be encrypted.
    Nothing is returned; the result is published through the global.
    """
    global encrypted_entities
    field_names = ("DOB", "FirstName", "LastName", "Address1", "Address2")
    encrypted_entities = list(field_names)
    

def detect_sensitive_info(r):
    """
    Mask the sensitive fields of one record and return the record.

    Two things happen to ``r`` (a dict-like record):

    * for every name in the module-level ``masked_entities`` list a
      ``<name>_masked`` field holding a fixed ``#`` placeholder is added;
    * the free text in ``r['AboutYourself']`` is sent to Amazon Comprehend
      (``detect_pii_entities``) and every detected span is replaced by its
      entity type; the result is stored in ``r['AboutYourself_masked']``.

    Failures while masking or calling Comprehend are printed and otherwise
    ignored, so the record is still returned (without the field that failed).
    When the free text is empty or ``None`` no Comprehend request is made and
    ``AboutYourself_masked`` is not set.

    Raises:
        KeyError: if the record has no ``AboutYourself`` field.
    """

    metadata = r['AboutYourself']
    try:
        for entity in masked_entities:
            entity_masked = entity + "_masked"
            r[entity_masked] = "#######################"
    except Exception:
        print ("DEBUG:",sys.exc_info())

    # Comprehend rejects empty text, so skip the client creation and the
    # request entirely; the outcome (no AboutYourself_masked field) is the
    # same as what the failed request used to produce.
    if not metadata:
        return r

    client_pii = boto3.client('comprehend', region_name=my_region)

    try:
        response = client_pii.detect_pii_entities(
            Text = metadata,
            LanguageCode = 'en'
        )
        clean_text = metadata
        # Substitute from the highest offset down so that replacing one span
        # never shifts the offsets of the spans still to be processed. The
        # order is enforced explicitly instead of relying on the response order.
        entities = sorted(response['Entities'],
                          key=lambda found: found['BeginOffset'],
                          reverse=True)
        for NER in entities:
            clean_text = clean_text[:NER['BeginOffset']] + NER['Type'] + clean_text[NER['EndOffset']:]
        print(clean_text)
        r['AboutYourself_masked'] = clean_text
    except Exception:
        # Best effort: keep the record even if the PII lookup fails.
        print ("DEBUG:",sys.exc_info())

    return r
    

def encrypt_rows(r):
    """
    Add a salted SHA3-256 digest for each configured field of one record.

    For every name in the module-level ``encrypted_entities`` list the field
    value is concatenated with the module-level ``salted_string``, hashed with
    SHA3-256, and the hex digest is stored under ``<name>_encrypted``.
    Note that this is a one-way digest, not reversible encryption.

    The salt is supplied by the caller of the job. Please feel free to use
    SSM and KMS to manage it.

    Each field is handled independently: a field that is missing or ``None``
    is skipped (a missing field is reported on stdout) and the remaining
    fields are still processed. Non-string values are converted with
    ``str()`` before hashing.

    The salt is deliberately never printed, so it does not end up in job logs.

    Returns:
        The same record ``r`` with the ``*_encrypted`` fields added.
    """
    for entity in encrypted_entities:
        try:
            value = r[entity]
            if value is None:
                # Nothing to hash for this field; keep going with the others.
                continue
            salted_entity = str(value) + salted_string
            hashkey = hashlib.sha3_256(salted_entity.encode()).hexdigest()
            r[entity + '_encrypted'] = hashkey
        except Exception:
            # Best effort per field: report and continue with the next one.
            print ("DEBUG:",sys.exc_info())
    return r


def set_salted_string(s):
    """
    Store ``s`` in the module-level ``salted_string``.

    Nothing is returned; the value is published through the global.
    """
    global salted_string
    salted_string = s

## @params: [JOB_NAME]
 
# Account id of the caller, looked up through STS.
accountid = boto3.client('sts').get_caller_identity().get('Account')

# Resolve the named arguments passed to the job on the command line.
args = getResolvedOptions(sys.argv,
                          ['JOB_NAME',
                           'salted_string',
                           'encrypted_bucket',
                           'curated_bucket',
                           'raw_table_name',
                           'patient_table_name',
                           'masked_bucket'])

salted_string=args['salted_string']
encrypted_bucket=args['encrypted_bucket']
masked_bucket=args['masked_bucket']
curated_bucket=args['curated_bucket']
patient_metrics_table=args['raw_table_name']
patient_metadata_table=args['patient_table_name']

# Initialize the S3 output paths, one per bucket.
curated_path = "s3://" + curated_bucket + "/secure-dl-curated-data"
masked_path = "s3://" + masked_bucket + "/secure-dl-masked-data"
# The encrypted data set belongs in the encrypted bucket; building this path
# from masked_bucket would place it next to the masked data and leave the
# 'encrypted_bucket' argument unused.
encrypted_path = "s3://" + encrypted_bucket + "/secure-dl-encrypted-data"

# Publish the salt, the entity lists and the region through the module-level
# globals that the record-level functions read.
set_salted_string(salted_string)
get_masked_entities()
get_encrypted_entities()
get_region_name()

# NOTE(review): 'now' is not referenced anywhere else in the visible script.
now = datetime.datetime.now()

# Set up the Spark / Glue contexts and initialise the job with its arguments.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Load both source tables from the "secure-db" catalog database.
patient_metadata = glueContext.create_dynamic_frame.from_catalog(database = "secure-db", table_name = patient_metadata_table, transformation_ctx = "patient_metadata")
patient_metrics = glueContext.create_dynamic_frame.from_catalog(database = "secure-db", table_name = patient_metrics_table, transformation_ctx = "patient_metrics")

# Debug output: schema and one sample row of each source.
# NOTE(review): show() prints unmasked row values to the job output - confirm
# that is acceptable for this data.
patient_metadata.printSchema()
patient_metadata.toDF().show(1)
patient_metrics.printSchema()
patient_metrics.toDF().show(1)

# Join patient metadata and patient metrics on PatientId (metadata) = pid (metrics)
combined_df=Join.apply(patient_metadata, patient_metrics, 'PatientId', 'pid', transformation_ctx = "combined_df")
combined_df.printSchema()
combined_df.toDF().show(1)

# Group by pid and the sensitive columns to get one row per distinct
# combination (including AboutYourself); count() adds a "count" column.
group=combined_df.toDF().groupBy("pid","DOB", "FirstName", "LastName", "Address1", "Address2", "AboutYourself").count()
group.show()

# Convert group to DynamicFrame
group_df=DynamicFrame.fromDF(group, glueContext, "group_df")

# Apply detect_sensitive_info to every grouped record to add the *_masked
# fields, including the redacted AboutYourself text
df_with_about_yourself = Map.apply(frame = group_df, f = detect_sensitive_info)
df_with_about_yourself.show()

# Apply encrypt_rows to every grouped record to add the *_encrypted fields.
# This maps over group_df again, not over the masked frame, so the masked and
# encrypted results are two independent frames.
df_with_about_yourself_encrypted = Map.apply(frame = group_df, f = encrypt_rows)

# Column used for the outer joins below
join_by_col = 'pid'
combined_df_x = combined_df.toDF()

# convert the masked frame to a DataFrame
df_with_about_yourself_x = df_with_about_yourself.toDF()
# drop the original sensitive columns so they do not collide with the same
# columns of combined_df_x in the join (the groupBy "count" column is kept)
df_with_about_yourself_x = df_with_about_yourself_x.drop("DOB", "FirstName", "LastName", "Address1", "Address2", "AboutYourself")

# convert the encrypted frame to a DataFrame
df_with_about_yourself_encrypted_x = df_with_about_yourself_encrypted.toDF()
# drop the original sensitive columns, as for the masked frame above
df_with_about_yourself_encrypted_x = df_with_about_yourself_encrypted_x.drop("DOB", "FirstName", "LastName", "Address1", "Address2", "AboutYourself")

# Outer-join the masked columns onto the combined data by pid, then drop the
# original sensitive columns that came from combined_df_x
combined_df_masked = df_with_about_yourself_x.join(combined_df_x, on = [join_by_col], how = 'outer').orderBy(join_by_col)
combined_df_masked = combined_df_masked.drop("DOB", "FirstName", "LastName", "Address1", "Address2", "AboutYourself")
combined_df_masked_x = DynamicFrame.fromDF(combined_df_masked, glueContext, "combined_df_masked_x")
combined_df_masked.show(1)

# Outer-join the encrypted columns onto the combined data by pid, then drop
# the original sensitive columns that came from combined_df_x
combined_df_encrypted = df_with_about_yourself_encrypted_x.join(combined_df_x, on = [join_by_col], how = 'outer').orderBy(join_by_col)
combined_df_encrypted = combined_df_encrypted.drop("DOB", "FirstName", "LastName", "Address1", "Address2", "AboutYourself")
combined_df_encrypted_x = DynamicFrame.fromDF(combined_df_encrypted, glueContext, "combined_df_encrypted_x")
combined_df_encrypted.show(1)

# Write the three data sets to S3 as parquet, partitioned by year/month/day/hour/pid.
# datasink5 writes combined_df, i.e. the joined data with its original columns;
# datasink6 and datasink7 write the masked and encrypted variants.
# NOTE(review): the partition columns year/month/day/hour are not created in
# this script - presumably they come from the source tables; verify.
datasink5 = glueContext.write_dynamic_frame.from_options(frame = combined_df, connection_type = "s3", connection_options = {"path": curated_path, "partitionKeys": ["year","month", "day", "hour","pid"]}, format = "parquet", transformation_ctx = "datasink5")
datasink6 = glueContext.write_dynamic_frame.from_options(frame = combined_df_masked_x, connection_type = "s3", connection_options = {"path": masked_path, "partitionKeys": ["year","month", "day", "hour","pid"]}, format = "parquet", transformation_ctx = "datasink6")
datasink7 = glueContext.write_dynamic_frame.from_options(frame = combined_df_encrypted_x, connection_type = "s3", connection_options = {"path": encrypted_path, "partitionKeys": ["year","month", "day", "hour","pid"]}, format = "parquet", transformation_ctx = "datasink7")

job.commit()