import re
import requests
import urllib.parse
from lxml import html

# Download the RP5 page that lists Russian cities.
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops early on an HTTP error instead of parsing an error
# page and silently producing an empty city list.
russian_cities_page = requests.get('https://rp5.ru/Погода_в_России', timeout=30)
russian_cities_page.raise_for_status()
russian_cities_tree = html.fromstring(russian_cities_page.content)

# Select every cell of the country map; each cell is searched for city
# identifiers later in the script.
columns = russian_cities_tree.xpath('//div[@class="countryMap"]//div[@class="countryMap-cell"]')

# Matches a percent-decoded RP5 city URL and captures its "Погода_..." slug.
city_link_regex = re.compile(r'^https?://rp5\.ru/(Погода_[А-Яа-я\w.,\-()]+)')

added_ids = []    # identifiers in the order they were written to the file
seen_ids = set()  # the same identifiers, kept for O(1) duplicate checks

# The slugs are Cyrillic, so the encoding is fixed to UTF-8 rather than left to
# the platform default (which may be unable to encode them). The context
# manager guarantees the file is closed even if a request or write fails.
with open('russia.csv', 'wt', encoding='utf-8') as cities_file:
    cities_file.write('ID,City\n')

    # Write all identifiers and cities
    for col in columns:
        for city in col.xpath('.//span[@class="Ajax-PointID"]//@id'):
            # Each identifier is written only once
            if city in seen_ids:
                continue
            seen_ids.add(city)
            added_ids.append(city)

            fields = [str(city)]

            # Comment this block to execute script faster and prevent DDoS detecting on RP5 server
            city_response = requests.get('https://rp5.ru/town.php?id=' + city, timeout=30)
            redirect = urllib.parse.unquote(city_response.url)
            print(redirect)
            slug_match = city_link_regex.search(redirect)
            if slug_match is None:
                # The final URL is not a recognizable city page; keep the row
                # (with an empty city column) instead of crashing mid-run.
                print('WARNING: cannot extract city name for id ' + str(city))
                fields.append('')
            else:
                fields.append(slug_match.group(1))

            # Write to file
            cities_file.write(','.join(fields) + '\n')
            print(city)
# Print an empty line, then wait for the user to press ENTER so the console
# output stays visible until they confirm.
print()
input('Press ENTER to exit ...')