Source code for src.core.downloader.AmazonBucketManager

#!/usr/bin/env python

# ==================================================================================== #
#  __     ______     __  __     __   __     ______     ______     ______     ______
# /\ \   /\  ___\   /\ \_\ \   /\ "-.\ \   /\  __ \   /\  ___\   /\  __ \   /\__  _\
# \ \ \  \ \ \____  \ \  __ \  \ \ \-.  \  \ \ \/\ \  \ \___  \  \ \  __ \  \/_/\ \/
#  \ \_\  \ \_____\  \ \_\ \_\  \ \_\\"\_\  \ \_____\  \/\_____\  \ \_\ \_\    \ \_\
#   \/_/   \/_____/   \/_/\/_/   \/_/ \/_/   \/_____/   \/_____/   \/_/\/_/     \/_/
#
# ==================================================================================== #
#
# Copyright (c) 2017 Sardegna Clima - Raffaele Bua (buele)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# ==================================================================================== #


import datetime
import re
from collections import OrderedDict
import urllib.request
import xml.etree.ElementTree as ET
from src.data.logger.logger import logger

__author__ = "Raffaele Bua (buele)"
__copyright__ = "Copyright 2017, Sardegna Clima"
__credits__ = ["Raffaele Bua"]
__license__ = "MIT"
__version__ = "0.1"
__maintainer__ = "Raffaele Bua"
__contact__ = "info@raffaelebua.eu"
__status__ = "Development"

class AmazonBucketManager:
    """ This class is an Adapter of Amazon AWS services.
        Main goal of this class is **Retrieve the list of products from aws via tile list, and time interval**

        The instance keeps two pieces of state between requests:

        * **product_list** the product paths collected so far
        * **last_item** the last key of the most recent truncated page, used to request the next page

        :param configurations: configuration object; this class reads its *aws_xmlns* and
                               *aws_products_regex* attributes
    """

    # Listing request for one tile and one year of the Sentinel-2 L1C bucket
    _URL_TEMPLATE = 'http://sentinel-s2-l1c.s3.amazonaws.com/?list-type=2&prefix=tiles/{tile}/{year}/'
    # Captures year, month and day from a product path; zone, month and day
    # may have one or two digits
    _DATE_REGEX = re.compile(r'tiles/[0-9]{1,2}/[A-Z]/[A-Z]{2}/([0-9]{4})/([0-9]{1,2})/([0-9]{1,2})/')

    def __init__(self, configurations):
        self.config = configurations
        self.product_list = []
        self.last_item = None

    def generate_url(self, tile, year):
        """ This method generates the aws url from the tile name string and the start year of filter interval.
            AWS service has the *prefix* parameter, to filter the list of available files in the bucket.
            This method generates the proper *prefix* parameter value, to give the list of files
            available in the year of *year* argument passed in the method.

            The generated parameter is like:

            .. code-block:: bash

                prefix=tiles/{tile}/{year}/

            :param tile: The tile name string with a pattern like

            .. code-block:: bash

                32/T/MK

            :param year: The year
            :returns: request url
            :rtype: String
        """
        return self._URL_TEMPLATE.format(tile=tile, year=year)

    def extract_date(self, item):
        """ This method extracts the date from the tile name string (complete with the date).

            :param item: The tile name complete string. e.g.

            .. code-block:: bash

                tiles/32/T/MK/YYYY/MM/DD/

            where
                * **YYYY** is the year
                * **MM** is the month (one or two digits)
                * **DD** is the day (one or two digits)

            :returns: The extracted date from the tile name string
            :rtype: datetime.date
            :raises ValueError: if *item* does not contain a tile path with a date,
                                or the captured values are not a valid calendar date

            The method applies the regex:

            .. code-block:: bash

                tiles/[0-9]{1,2}/[A-Z]/[A-Z]{2}/([0-9]{4})/([0-9]{1,2})/([0-9]{1,2})/

            to the *item* parameter
        """
        match = self._DATE_REGEX.search(item)
        if match is None:
            # Fail with an explicit message instead of an AttributeError on None
            raise ValueError("No date found in tile name: " + str(item))
        year, month, day = (int(value) for value in match.groups())
        return datetime.date(year, month, day)

    def load_products(self, tile, year, paginated=False):
        """ *load_products* loads one page of the product list from the Amazon *AWS* Bucket of
            Sinergise *Sentinel-2 on AWS*.
            This method contains the *AWS* APIs semantic, it is the core of the Adapter.
            It manages also pagination.

            :param tile: Tile name
            :type tile: String
            :param year: Year of interest, the method extracts the list of products for this year
            :type year: String
            :param paginated: if this parameter is *True* that means that the list provided from AWS is
                              paginated and the next page starts after the *last_item*
            :type paginated: Boolean
            :returns: *True* if the response is truncated and another page must be requested
                      (in that case *last_item* is updated), *False* otherwise
            :rtype: Boolean

            This method appends the products found to the class property *product_list*.
            Keys that do not match the configured *aws_products_regex* are logged and skipped.
        """
        logger.debug("(AmazonBucketManager load_products)")
        # Generate url for year
        url = self.generate_url(tile, year)
        # Append to url start_from attribute if the request is paginated
        if paginated:
            url = url + "&start-after=" + self.last_item
        # Http request; the context manager releases the connection even on errors
        with urllib.request.urlopen(url) as response:
            payload = response.read().decode('utf-8')
        root = ET.fromstring(payload)
        namespace = '{' + self.config.aws_xmlns + '}'
        # Extract data from xml
        last_key = None
        for item in root.findall(namespace + 'Contents'):
            key = item.findtext(namespace + 'Key')
            if not key:
                # Entry without a usable key: nothing to match against
                continue
            last_key = key
            # re.search signals "no match" by returning None, it does not raise
            product_path = re.search(self.config.aws_products_regex, key)
            if product_path is None:
                logger.warn("Product not found for key: " + key)
                continue
            self.product_list.append(product_path.group(0))
        # Check if the page is truncated (paginated case)
        is_truncated = root.findtext(namespace + 'IsTruncated') == 'true'
        if is_truncated and last_key is not None:
            self.last_item = last_key
            return True
        # Not truncated, or truncated without any key to continue from:
        # asking again would only return the same page
        return False

    def get_products_list(self, searchFilter):
        """ This method manages the retrieval of aws products.
            It loads every page of every year of the search interval, removes duplicates,
            and keeps the products whose date falls in the interval (bounds included).

            :param searchFilter: Search parameters to filter the available files in the bucket;
                                 this method reads its *tile*, *start_date* and *end_date* attributes
            :type searchFilter: SearchFilter

            :returns: The pending products available in the Amazon bucket, sorted by date.
                      One product per date is returned; when several products share a date
                      the lexicographically greatest path is kept.
            :rtype: list
            :raises ValueError: if a collected product path does not contain a date

            Years after the *end_date* year are not requested, because their products
            would be discarded by the date filter anyway.
        """
        logger.debug("(AmazonBucketManager get_products_list)")
        tile = searchFilter.tile
        start_date = searchFilter.start_date
        end_date = searchFilter.end_date
        self.product_list = []
        self.last_item = None
        first_year = int(start_date.year)
        # Never look past the current year: the bucket has no products in the future
        last_year = min(int(end_date.year), datetime.datetime.now().year)
        for year in range(first_year, last_year + 1):
            is_truncated = self.load_products(tile, year, False)
            while is_truncated:
                is_truncated = self.load_products(tile, year, True)
        # Clean list of products; sorting keeps the attribute a list and makes
        # the choice among products sharing a date independent of hash order
        self.product_list = sorted(set(self.product_list))
        # Generate dictionary date -> product
        products_by_date = {}
        for product in self.product_list:
            product_string = str(product)
            products_by_date[self.extract_date(product_string)] = product_string
        # Sort by date and filter products via date interval
        return [
            products_by_date[product_date]
            for product_date in sorted(products_by_date)
            if product_date >= start_date and product_date <= end_date
        ]