# Install and Import Dependent modules

In [None]:
!pip install tlslite

In [None]:
!pip install oauth2

In [None]:
import urllib
import oauth2 as oauth
from tlslite.utils import keyfactory
import json
import sys
import os
import base64
import boto3
from boto3.dynamodb.conditions import Key, Attr
import datetime
import logging
import pprint
import time
from pytz import timezone

logger = logging.getLogger()
logger.setLevel(logging.INFO)

#### Set up the SSM parameters as shown in the blog post, then proceed to the next section (the SSM parameters are listed below)
- jira_access_urls: Parameter to store the URLs used to access JIRA via the REST API
- jira_access_secrets: Parameter to store the secrets used to access JIRA
- jira_access_private_key: Parameter to store the private key corresponding to the public key specified in the JIRA REST API configuration.

In [None]:
ssm = boto3.client("ssm", region_name='us-east-1')

# Perform 3-Legged OAuth Process Referred to as “OAuth dance”

#### Define the Signature Class to sign JIRA REST API requests

In [None]:
class SignatureMethod_RSA_SHA1(oauth.SignatureMethod):
    """OAuth signature method registered under the name 'RSA-SHA1'.

    The private key used for signing is read from the SSM parameter
    'jira_access_private_key' every time a request is signed.
    """
    name = 'RSA-SHA1'

    def signing_base(self, request, consumer, token):
        """Return the ``(key, raw)`` pair for a request.

        :param request: OAuth request exposing ``method``, ``normalized_url``
            and ``get_normalized_parameters()``
        :param consumer: OAuth consumer whose ``secret`` starts the key
        :param token: optional OAuth token; its ``secret`` is appended to the key
        :return: tuple of (key, raw), where raw is the escaped method, URL and
            normalized parameters joined with '&'
        :raises ValueError: if the request has no normalized URL
        """
        normalized_url = getattr(request, 'normalized_url', None)
        if normalized_url is None:
            raise ValueError("Base URL for request is not set.")

        components = [
            request.method,
            normalized_url,
            request.get_normalized_parameters(),
        ]
        raw = '&'.join(oauth.escape(component) for component in components)

        # Key layout is "<consumer secret>&<token secret>"; the token part is optional
        key = '{0}&'.format(oauth.escape(consumer.secret))
        if token:
            key += oauth.escape(token.secret)

        return key, raw

    def sign(self, request, consumer, token):
        """Sign the request's base string and return the base64-encoded signature.

        :return: base64-encoded signature as bytes
        """
        # Only the raw base string is signed; the key part is not used here
        _, base_string = self.signing_base(request, consumer, token)

        # The private key is stored (encrypted) in SSM rather than on disk
        parameter = ssm.get_parameter(Name='jira_access_private_key', WithDecryption=True)
        pem_text = parameter['Parameter']['Value'].strip()

        rsa_key = keyfactory.parsePrivateKey(pem_text)

        # hashAndSign is given bytes, so the base string is encoded first
        return base64.b64encode(rsa_key.hashAndSign(base_string.encode()))

#### Get consumer_key & consumer_secret from the SSM parameter; these were defined in the JIRA portal

In [None]:
jira_secrets = json.loads(ssm.get_parameter(Name='jira_access_secrets', WithDecryption=True)['Parameter']['Value'])

In [None]:
jira_secrets

In [None]:
consumer_key = jira_secrets["consumer_key"]
consumer_key

In [None]:
consumer_secret = jira_secrets["consumer_secret"]
consumer_secret

#### Define URLs to be used in 3 Legged OAuth Process
- These URLs are defined while setting up the REST API endpoint in JIRA
- Here are sample URLs showing how each one is formed and its components
    - request_token_url = 'https://jiratoawss3.atlassian.net/plugins/servlet/oauth/request-token'
    - access_token_url = 'https://jiratoawss3.atlassian.net/plugins/servlet/oauth/access-token'
    - authorize_url = 'https://jiratoawss3.atlassian.net/plugins/servlet/oauth/authorize'

In [None]:
request_token_url = 'input_here'
access_token_url = 'input_here'
authorize_url = 'input_here'

#### Step-1 of 3-Legged OAuth Process
- Generate Request Token

In [None]:
# Create Consumer using consumer_key and consumer_secret
consumer = oauth.Consumer(consumer_key, consumer_secret)

In [None]:
# Use Consumer to create oauth client
client = oauth.Client(consumer)

In [None]:
# Add Signature Method to the client
client.set_signature_method(SignatureMethod_RSA_SHA1())

In [None]:
# Get response from request token URL using the client
resp, content = client.request(request_token_url, "POST")

In [None]:
# Convert the content received from previous step into a Dictionary
request_token = dict(urllib.parse.parse_qsl(content))

In [None]:
# request token has two components oauth_token and oauth_token_secret
request_token

#### Step-2 of 3 Legged OAuth Process
- Manually approve the Request Token by opening the URL below in a browser
- Approve the request on the page that the URL below opens in the browser
- Example value of the final authorize URL:
- https://jiratoawss3.atlassian.net/plugins/servlet/oauth/authorize?oauth_token=wYLlIxmcsnZTHgTy2ZpUmBakqzmqSbww

In [None]:
authorize_url + '?oauth_token=' + request_token[b'oauth_token'].decode()

#### Step-3 of 3 Legged OAuth Process
- Use Approved Request Token to generate Access Token

In [None]:
# Create an oauth token using components of request token
token = oauth.Token(request_token[b'oauth_token'], request_token[b'oauth_token_secret'])

In [None]:
# Use Consumer and token to create oauth client
client = oauth.Client(consumer, token)

In [None]:
# Add Signature Method to the client
client.set_signature_method(SignatureMethod_RSA_SHA1())

In [None]:
# Get response from access token URL using the client
access_token_resp, access_token_content = client.request(access_token_url, "POST")

In [None]:
access_token_content

#### Update access_token key in SSM Parameter jira_access_secrets with value of access_token_content and go to next step
- This Access token is valid for 5 years (expires_in key of access_token_content tells when token will expire in seconds)
- Rotation of Access Key depends on Organization's Security policy and is out of scope of this blogpost

## Test Access to Jira using Access Token

In [None]:
# Convert Access Token to Dictionary
access_token = dict(urllib.parse.parse_qsl(access_token_content))

In [None]:
# Display the value of access token
access_token

In [None]:
# Create an oauth token using components of access token
accessToken = oauth.Token(access_token[b'oauth_token'], access_token[b'oauth_token_secret'])

In [None]:
# Use Consumer and Access Token to create oauth client
client = oauth.Client(consumer, accessToken)

In [None]:
# Add Signature Method to the client
client.set_signature_method(SignatureMethod_RSA_SHA1())

#### data_url is also defined while setting-up RestAPI endpoint in jira
- Below is the sample value of data URL
    - https://awsjiratos3.atlassian.net/rest/api/2/search?jql=project=Test_Project_1
- Data URL has two components
    - RestAPI Endpoint to get data from JIRA
    - JQL (Jira Query Language) to filter data from JIRA example: by project, time frame etc.

In [None]:
# Define the data URL used to pull data from JIRA; point it at a test project with a limited number of records
data_url = 'input_here'

In [None]:
# Pull Data for test project from JIRA
jira_resp, jira_content = client.request(data_url, "GET")

In [None]:
# jira_content will have data returned from JIRA in json format
pprint.pprint(jira_content)

# Sample Code to Pull Data From JIRA into S3
- This process should be done as part of code deployment (CloudFormation or CDK)
- AWS Region should be parameterized

#### Create get_data function to orchestrate data pull from JIRA and extract data to s3
- As part of Orchestration this function will identify number of records which will be extracted from JIRA
- Loops through them based on JIRA Page size
- Calls query_endpoint to get data for a page
- Writes data to s3

In [None]:
def _build_query_string(project, load_type, start_date, end_date, start_at_index, result_size):
    """Build the JIRA search query string for one page of results.

    A 'Bulk' load filters only on ``updated <= end_date``; any other load type
    filters on ``updated > start_date`` and ``updated <= end_date``.
    """
    if load_type == 'Bulk':
        date_conditions = ["&updated<='" + end_date + "'"]
    else:
        date_conditions = ["&updated>'" + start_date + "'",
                           "&updated<='" + end_date + "'"]

    # The date conditions are part of the jql value, so they are fully percent-encoded
    escaped_conditions = ''.join(urllib.parse.quote(condition, safe='') for condition in date_conditions)

    query_string_template = ("?jql=project={0}{1}&startAt={2}&maxResults={3}&fields=*all"
                             "&expand=renderedFields,names,schema,transitions,operations,changelog,"
                             "projects.issuetypes.fields&fieldsByKeys=true")
    return query_string_template.format(project, escaped_conditions, start_at_index, result_size)


def _fetch_page(query_string, endpoint_url, consumer, client_token):
    """Call query_endpoint for one page; return the response dict, or None on any failure."""
    logger.info("Retrieving data: {0}".format(endpoint_url + query_string))
    try:
        data = query_endpoint(query_string, endpoint_url, consumer, client_token)
    except Exception as data_err:
        logger.error("Unable to retrieve data: {0}".format(str(data_err)))
        return None

    # query_endpoint returns None when the HTTP request itself failed; anything
    # that is not a dict cannot be read with .get() below.
    if not isinstance(data, dict):
        logger.error("Unable to retrieve data: unexpected response {0!r}".format(type(data)))
        return None
    return data


def _write_issues_to_s3(s3, bucket, key, issues_list):
    """Serialize issues_list to JSON and upload it to s3://bucket/key; errors are logged, not raised."""
    try:
        json_issues_list = json.dumps(issues_list)
    except Exception as json_err:
        logger.error("Unable to serialize issues_list to json: {0}".format(str(json_err)))
        return

    logger.info("Attempting to write JSON to bucket {0} and key {1}".format(bucket, key))
    try:
        s3.put_object(Bucket=bucket, Key=key, Body=json_issues_list)
    except Exception as e:
        logger.error("Unable to write JSON to bucket {0} and key {1} because {2}".format(bucket, key, str(e)))


def get_data(project, load_type, start_date, end_date, s3_end_date_prefix):
    """
    Extract issues for one JIRA project page by page and write each page to s3 as a JSON file.

    :param
        project: JIRA Project for which data needs to be extracted to s3
        load_type: Bulk / Incremental
        start_date: Date as String in 'YYYY-MM-DD HH:00' format (Blank if load_type is Bulk)
        end_date: Date as String in 'YYYY-MM-DD HH:00' format
        s3_end_date_prefix: End Date formatted as s3 Prefix

    :return:
        Total Number of Records Extracted From JIRA. Retrieval, serialization and
        s3 errors are logged rather than raised, so the count may cover only the
        pages fetched before a failure.
    """

    # Define s3 client
    s3 = boto3.client('s3')

    # Define s3 bucket & key where JIRA Output will be written
    # these can be passed as Glue Job Parameter or Lambda Function Environment Variable
    # or stored as ssm parameter
    output_s3_bucket = 'sample-project-tracking-ingest'
    key_prefix = 'jira_data' + '/' + project + '/' + s3_end_date_prefix + '/'

    # Set values for JQL Query Variables
    start_page_index = 0
    start_at_index = 0
    result_size = 50
    max_pages = 100000

    # Parameter to sleep between two API calls
    api_sleep_in_seconds = 1

    # Define variables to create Consumer & Signature for accessing JIRA - jira_access_secrets
    jira_access_secrets = json.loads(
        ssm.get_parameter(Name='jira_access_secrets', WithDecryption=True)['Parameter']['Value'])
    consumer_key = jira_access_secrets['consumer_key']
    consumer_secret = jira_access_secrets['consumer_secret']

    # The stored access token is a query-string style value; parse_qsl on bytes yields bytes keys
    oauth_token = jira_access_secrets['access_token'].encode()

    # Define Base URL / Search End Point
    jira_access_urls = json.loads(
        ssm.get_parameter(Name='jira_access_urls', WithDecryption=True)['Parameter']['Value'])
    endpoint_url = jira_access_urls['data_url']

    logger.info("Generating token consumer")

    # Generate consumer
    consumer = oauth.Consumer(consumer_key, consumer_secret)

    # Generate Access Token
    access_token = dict(urllib.parse.parse_qsl(oauth_token))

    # Generate Client Token from Access Token
    client_token = oauth.Token(access_token[b'oauth_token'], access_token[b'oauth_token_secret'])

    # ---- First page: also tells us how many records match in total ----
    query_string = _build_query_string(project, load_type, start_date, end_date, start_at_index, result_size)
    data = _fetch_page(query_string, endpoint_url, consumer, client_token)
    if data is None:
        # A failed first request is treated as "no records" instead of crashing
        data = {}

    # Get Total Rows & Extract Data Records
    total_entries = data.get('total', 0)
    issues_list = data.get('issues', [])
    estimated_max_pages = (total_entries // result_size) + 1

    logger.info("Total number of issues: {0}. Initial page retrieved {1} issues.".format(total_entries,
                                                                                         len(issues_list)))

    # Sleep for api_sleep_in_seconds between two API calls to JIRA
    if total_entries > 0:
        logger.info("API sleep for {0} seconds.".format(api_sleep_in_seconds))
        time.sleep(api_sleep_in_seconds)

    # Put the Data to s3
    if issues_list:
        file_name = project + '_' + str(start_page_index) + '.json'
        _write_issues_to_s3(s3, output_s3_bucket, key_prefix + file_name, issues_list)

    num_entries = len(issues_list)
    start_at_index += len(issues_list)
    start_page_index += 1

    # ---- Remaining pages ----
    logger.info("Query has {0} total entries. Max pages set to {1}".format(total_entries, max_pages))

    while start_at_index < total_entries and start_page_index < max_pages and start_page_index < estimated_max_pages:

        file_name = project + '_' + str(start_page_index) + '.json'

        query_string = _build_query_string(project, load_type, start_date, end_date, start_at_index, result_size)
        data = _fetch_page(query_string, endpoint_url, consumer, client_token)
        if data is None:
            break

        # Extract Data Records
        issues_list = data.get('issues', [])
        if not issues_list:
            # An empty page cannot advance start_at_index, so further requests
            # would only repeat this same query until the page limit is hit.
            logger.warning("Received an empty page at offset {0}; stopping pagination".format(start_at_index))
            break

        # Increment counters
        num_entries += len(issues_list)
        start_at_index += len(issues_list)
        start_page_index += 1

        # Logged after the increment so the count includes the page just fetched
        logger.info("Retrieved {0} pages out of max {1}".format(start_page_index, max_pages))

        # Handle throttling
        logger.info("Sleep for {0} seconds.".format(api_sleep_in_seconds))
        time.sleep(api_sleep_in_seconds)

        # Handle serialization and upload
        _write_issues_to_s3(s3, output_s3_bucket, key_prefix + file_name, issues_list)

    logger.info("Completed call to Search API for project {0}".format(project))
    return num_entries


#### Create query_endpoint which will be called from get_data function
- This function will hit the JIRA RESTApi end point to extract the data and return it to get_data function

In [None]:
def query_endpoint(query_string, endpoint_url, consumer, client_token):
    """Query JIRA endpoint.

    :param query_string: Query string (starting with '?') appended to endpoint_url
    :param endpoint_url: JIRA REST API endpoint URL
    :param consumer: oauth.Consumer used to build the OAuth client
    :param client_token: oauth.Token (access token) used to build the OAuth client
    :return: Response body deserialized from JSON, or None when the HTTP request
        itself failed (the error is logged, not raised)
    :raises ValueError: if the response body is not valid JSON (raised by json.loads)
    """
    # Create oauth client
    client = oauth.Client(consumer, client_token)
    # Create Signature for hitting JIRA End Point
    client.set_signature_method(SignatureMethod_RSA_SHA1())

    # Retrieve data
    logger.info("Attempting to retrieve data from endpoint '{0}' with query '{1}'".format(endpoint_url,
                                                                                          query_string))

    data_url = endpoint_url + query_string

    logger.info("Requesting data: {0}".format(data_url))

    data_resp, data_content = None, None
    try:
        data_resp, data_content = client.request(data_url, "GET")
    except Exception as dreq_err:
        logger.error("Unable to retrieve data: response={0}. Error: {1}".format(data_resp, str(dreq_err)))

    logger.info("Response from service endpoint: {0}".format(data_resp))

    # Surface HTTP-level failures: an error response may still carry a JSON body
    # that parses fine but holds no issues, which would otherwise pass silently.
    # NOTE(review): assumes the response object exposes an integer `status`
    # attribute (as httplib2 responses do) - confirm for the client in use.
    status = getattr(data_resp, 'status', None)
    if isinstance(status, int) and not 200 <= status < 300:
        logger.error("JIRA endpoint returned HTTP status {0} for {1}".format(status, data_url))

    if data_content is None:
        return None

    logger.info("Attempting to deserialize data")
    # Output needs to be decoded from bytes to utf8 string.
    return json.loads(data_content.decode('utf8'))


#### main function
- Loops through Projects in Scope for data extraction from JIRA
- Inside each loop:
    - Identifies the Load Type for the Project (Bulk vs Incremental)
    - In case of Incremental Load Type Identifies Start Date & End Date i.e. date range for which data needs to be extracted from JIRA
    - Invokes get_data for data extraction
    - Updates DynamoDB with new last_ingestion date which will be set to start date for next execution
- Define this as main function inside glue Job or as lambda handler inside lambda function

In [None]:
# Define variables DynamoDB, these can be passed as Glue Job Parameter or Lambda Function Environment Variable
# or stored as ssm parameter
dynamodb_table = 'jira_batch_tracking'
dynamodb_partition_key = 'jira_project_name'

# Define AWS Region, this needs to be identified and passed dynamically as part of CI/CD code deployment process
aws_region = 'us-east-1'

# Get the values of JIRA projects, these can be passed as Glue Job Parameter or Lambda Function Environment Variable
# or stored as ssm parameter
jira_projects = 'Test_Project_1, Test_Project_2, Test_Project_3'

# Convert the Project String into Project List.
# Names are stripped because the string is separated by ", ": a leading space would
# otherwise end up in the DynamoDB key, the s3 key and the JQL query URL.
jira_project_list = [name.strip() for name in jira_projects.split(',') if name.strip()]

# Specify the Timezone in which JIRA is set-up
est = timezone('EST')

# Calculate Current System date
current_DT = datetime.datetime.now(est)

# The extraction window ends two hours after the current hour. timedelta arithmetic
# is used so that late-evening runs roll over to the next day instead of producing
# an invalid hour of 24 or 25.
# NOTE(review): the reason for the 2-hour offset is not visible here - confirm.
end_DT = current_DT + datetime.timedelta(hours=2)
year_DT = end_DT.year
month_DT = end_DT.month
day_DT = end_DT.day
hour_DT = end_DT.hour
minute_DT = current_DT.minute

# Prefix Month, Day and Hour with 0 in case it's a single digit value
month_DT_str = '{0:02}'.format(month_DT)
day_DT_str = '{0:02}'.format(day_DT)
hour_DT_str = '{0:02}'.format(hour_DT)

start_date = ''
s3_end_date_prefix = ''

# Calculate the Datetime till JIRA data will be pulled
# This is rounded upto the hour
end_date = "{0}-{1}-{2} {3}:00".format(year_DT, month_DT_str, day_DT_str, hour_DT_str)
logger.info("Value of End Date is - {0}".format(end_date))
s3_end_date_prefix = 'yr=' + str(year_DT) + '/mo=' + str(month_DT) + '/dt=' + str(day_DT) + '/hr=' + str(
    hour_DT) + ':00'

# The DynamoDB tracking table handle is the same for every project, so create it once
dynamodb_client = boto3.resource('dynamodb', region_name=aws_region)
db_mtable = dynamodb_client.Table(dynamodb_table)

# Call get_data function to extract data from JIRA one Project at a time
for project in jira_project_list:
    logger.info("JIRA Data Pull for Project - {0}".format(project))

    # Query Project From DynamoDB Tracking Table
    response = db_mtable.query(
        KeyConditionExpression=Key(dynamodb_partition_key).eq(project)
    )

    if response['Count'] == 0:
        # Project is Not Present in DynamoDB: Load Type is Bulk and
        # all the data from Jira will be extracted to s3
        load_type = 'Bulk'
        # Reset so a start date from a previous project is never carried over
        start_date = ''
    else:
        # Project is Present in DynamoDB: Load Type is Incremental and only JIRA Stories
        # modified between start_date and end_date will be extracted to s3
        load_type = 'Incremental'
        # Get the Last Ingestion Date from DynamoDB response object and assign it to Start Date
        start_date = response['Items'][0]['last_ingest_date']
        logger.info("Value of Start Date is - {0}".format(start_date))

    # Call get_data function to extract data for the Project from JIRA
    total_entries = get_data(project, load_type, start_date, end_date, s3_end_date_prefix)

    # Insert/Update Entry in DynamoDB for the Project and set end_date as last_ingest_date
    # NOTE(review): get_data logs retrieval/s3 errors instead of raising, so
    # last_ingest_date is advanced even when the pull failed part-way - consider
    # guarding this update if skipped windows are a concern.
    put_response = db_mtable.put_item(
        Item={
            dynamodb_partition_key: project,
            'last_ingest_date': end_date
        }
    )
    logger.info("Value of DynamoDB Put Response is - {0}".format(put_response))
    logger.info(
        "JIRA Data Pull for Project - {0} completed and total {1} records loaded to landing bucket".format(project,
                                                                                                        total_entries))