# python-scripts/rp5id_parser/main.py
# (repository viewer metadata: 47 lines, 1.5 KiB, Python; last change 2021-06-08 16:28:01 +04:00)
import re
import requests
import urllib.parse
from lxml import html
# Page that lists the Russian cities known to RP5.
CITIES_INDEX_URL = 'https://rp5.ru/Погода_в_России'
# Requesting this URL with a point ID appended ends up on the city's own page.
CITY_BY_ID_URL = 'https://rp5.ru/town.php?id='
OUTPUT_FILE = 'russia.csv'
# Seconds to wait for RP5 before giving up, so a stalled connection cannot
# hang the script forever.
REQUEST_TIMEOUT = 30
# Set to False to execute the script faster and prevent DDoS detection on the
# RP5 server: the per-city request is skipped and rows contain only the ID.
RESOLVE_CITY_NAMES = True

# Captures the "Погода_..." path segment of a city page URL.
city_link_regex = re.compile(r'^https?://rp5\.ru/(Погода_[А-Яа-я\w.,\-()]+)')


def extract_city_path(url):
    """Return the ``Погода_...`` path segment of an RP5 city URL.

    Args:
        url: Already percent-decoded URL string.

    Returns:
        The captured path segment, or ``None`` when *url* is not an RP5
        city page URL.
    """
    match = city_link_regex.search(url)
    return match.group(1) if match else None


def unique_in_order(items):
    """Yield each item the first time it is seen, preserving input order."""
    seen = set()
    for item in items:
        if item not in seen:
            seen.add(item)
            yield item


def fetch_city_ids():
    """Download the RP5 index page and return every point ID found on it.

    Returns:
        A list of ID strings in page order; duplicates are not removed.

    Raises:
        requests.RequestException: If the index page cannot be downloaded
            or the server answers with an HTTP error status.
    """
    page = requests.get(CITIES_INDEX_URL, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    tree = html.fromstring(page.content)
    # Extract the table with cities
    columns = tree.xpath('//div[@class="countryMap"]//div[@class="countryMap-cell"]')
    return [
        str(city_id)
        for col in columns
        for city_id in col.xpath('.//span[@class="Ajax-PointID"]//@id')
    ]


def resolve_city_path(city_id):
    """Look up the city page path for *city_id* via one request to RP5.

    Returns:
        The ``Погода_...`` path segment of the final URL, or ``None`` when
        the request fails or the final URL is not a city page.
    """
    try:
        response = requests.get(CITY_BY_ID_URL + city_id, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # One unreachable city must not abort the whole crawl.
        print('Request for ID ' + city_id + ' failed: ' + str(error))
        return None
    redirect = urllib.parse.unquote(response.url)
    print(redirect)
    return extract_city_path(redirect)


def main():
    """Write every unique RP5 point ID (and its city path) to the CSV file."""
    city_ids = fetch_city_ids()
    # Explicit UTF-8: the city paths are Cyrillic and the locale default
    # encoding may be unable to represent them. The context manager closes
    # the file even when a later step raises.
    with open(OUTPUT_FILE, 'wt', encoding='utf-8') as cities_file:
        cities_file.write('ID,City\n')
        # Write all identifiers and cities
        for city_id in unique_in_order(city_ids):
            row = city_id
            if RESOLVE_CITY_NAMES:
                city_path = resolve_city_path(city_id)
                if city_path is None:
                    # Keep the ID so the row is not lost; the city column
                    # is simply absent for this entry.
                    print('Could not resolve a city name for ID ' + city_id)
                else:
                    # NOTE: the path may itself contain commas; the ID is
                    # always the text before the first comma.
                    row += ',' + city_path
            cities_file.write(row + '\n')
            print(city_id)
    print('')
    input('Press ENTER to exit ...')


if __name__ == '__main__':
    main()