Python: Uploading S3 statistics into DynamoDB

Honestly, this is something I will probably never use: I found out that even though uploading data to DynamoDB with Boto is fairly easy, analyzing the data afterwards in a NoSQL database is much more difficult.
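To give a concrete example of what "analyze" means here: DynamoDB has no ad-hoc SQL, so even a simple question like "which IPs made the most requests" turns into a full table scan with client-side aggregation. A minimal sketch, assuming the s3_logs table populated by the script below:

#!/usr/bin/env python3
import boto3
from collections import Counter

dynamodb = boto3.resource('dynamodb', region_name='eu-west-1')
table = dynamodb.Table('s3_logs')

counts = Counter()
response = table.scan()
while True:
    for item in response['Items']:
        counts[item['ip']] += 1
    # scan() returns results in pages; follow LastEvaluatedKey until done
    if 'LastEvaluatedKey' not in response:
        break
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])

print(counts.most_common(10))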

You can use this script, for example, to parse a ‘logs’ directory containing S3 access logs by running insert_dynamo.py logs
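For reference, the regex below expects records like this one (a made-up line following the documented S3 server access log format; note that the hard-coded ' - ' fields mean only anonymous requests with no error code will match):

abc123 my-bucket [06/Feb/2019:00:00:38 +0000] 192.0.2.3 - 3E57427F3EXAMPLE REST.GET.OBJECT file.png "GET /file.png HTTP/1.1" 200 - 2662992 2662992 70 10 "-" "curl/7.15.1" -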

#!/usr/bin/env python3
import boto3, re, hashlib, sys, os

# boto3 derives the regional endpoint itself; no need to hard-code a plain-HTTP URL
dynamodb = boto3.resource('dynamodb', region_name='eu-west-1')

table = dynamodb.Table('s3_logs')

# https://stackoverflow.com/questions/12544510/parsing-apache-log-files
# https://regex101.com/r/cgY3Zu/1

def parse_file(file):
    # raw string so the backslashes are not interpreted as escape sequences
    regex = r'(\w+) ([\w_-]+) \[(.*?)\] ([(\d\.)]+) - (\w+) ([\w\.]*) (.+?) "(.*?)" (\d+) - (\d+) (\d+) (\d+) (\d+) "(.*?)" "(.*?)" -'

    with open(file) as f:
        # a log file can hold more than one record, so parse every line
        for line in f:
            match = re.match(regex, line)
            if not match:
                continue

            groups = match.groups()

            try:
                table.put_item(
                    Item={
                        # hash of the whole record doubles as a unique key
                        'id': hashlib.sha224(line.encode('utf-8')).hexdigest(),
                        'repository': groups[1],   # bucket name
                        'date': groups[2],
                        'ip': groups[3],
                        'item': groups[6],         # object key
                        'referer': groups[13],
                        'agent': groups[14]
                    }
                )
                print("PutItem succeeded")
            except Exception as e:
                print(f"PutItem failed: {e}")

inputdir = sys.argv[1]

for file in os.listdir(inputdir):
    parse_file(os.path.join(inputdir, file))
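Note that the script assumes the s3_logs table already exists. If it doesn't, a one-off snippet along these lines will create it (using 'id' as the string partition key to match the SHA-224 hash above is my choice; the throughput numbers are arbitrary):

#!/usr/bin/env python3
import boto3

dynamodb = boto3.resource('dynamodb', region_name='eu-west-1')

# 'id' as the partition key matches the hash insert_dynamo.py generates
table = dynamodb.create_table(
    TableName='s3_logs',
    KeySchema=[{'AttributeName': 'id', 'KeyType': 'HASH'}],
    AttributeDefinitions=[{'AttributeName': 'id', 'AttributeType': 'S'}],
    ProvisionedThroughput={'ReadCapacityUnits': 5, 'WriteCapacityUnits': 5}
)

# block until DynamoDB reports the table is ready for writes
table.meta.client.get_waiter('table_exists').wait(TableName='s3_logs')
print(table.table_status)

If you end up inserting a lot of records, boto3's table.batch_writer() context manager is also worth a look: it batches the individual put_item calls for you.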