python-scripts/habr_parser.py

30 lines
1.1 KiB
Python

import sys
from urllib.parse import urljoin

import requests
from lxml import etree
from lxml import html
# HTTP headers passed to every requests.get() call in this script.  The
# User-Agent value imitates desktop Chrome 70 on Windows 10 (presumably so the
# site serves the same markup a regular browser would get -- TODO confirm).
useragent_headers = {
    "User-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.77 Safari/537.36"
    ),
}
# Landing page that is scraped for news links; also used as the base URL when
# an article link turns out to be relative.
HABR_URL = "https://habr.com/"

# Seconds to wait on each HTTP request.  requests has no default timeout, so
# without this a stalled connection would hang the script indefinitely.
REQUEST_TIMEOUT = 10

# Title links of the news items inside the landing page's news block.
NEWS_LINK_XPATH = (
    '//div[@class="new-block"]//ul[@class="content-list"]'
    '/li[@class="content-list__item content-list__item_news-topic"]'
    '//a[@class="news-topic__title"]'
)

# Container element holding the body of a single article page.
ARTICLE_BODY_XPATH = (
    '//div[@class="post__text post__text-html post__text_v1"'
    ' and @id="post-content-body"]'
)


def _fetch_tree(url):
    """Download *url* and return the page parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
        lxml.etree.LxmlError / ValueError: if lxml cannot parse the body.
    """
    response = requests.get(
        url, headers=useragent_headers, timeout=REQUEST_TIMEOUT
    )
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    response.raise_for_status()
    return html.fromstring(response.text)


def _fetch_article_html(url):
    """Return the serialized article body found at *url*, or None if absent.

    The body is the first element matching ARTICLE_BODY_XPATH.  Propagates
    the exceptions documented on ``_fetch_tree``.
    """
    bodies = _fetch_tree(url).xpath(ARTICLE_BODY_XPATH)
    if not bodies:
        return None
    # encoding="unicode" returns a str directly.  The default (ASCII bytes)
    # escapes every non-ASCII character as a numeric character reference,
    # which makes Cyrillic article text unreadable when printed.
    return etree.tostring(bodies[0], encoding="unicode")


site_tree = _fetch_tree(HABR_URL)
habr_news = site_tree.xpath(NEWS_LINK_XPATH)

for habr_new in habr_news:
    href = habr_new.get("href")
    if not href:
        # A link without a target cannot be followed; skip it.
        continue
    # Absolute links pass through unchanged; relative ones are resolved
    # against the landing page.
    article_url = urljoin(HABR_URL, href)

    try:
        article_text = _fetch_article_html(article_url)
    except (requests.RequestException, etree.LxmlError, ValueError) as error:
        # Best effort: one broken article must not abort the whole run, but
        # the failure is reported on stderr instead of being hidden.
        print("skipping", article_url, "-", error, file=sys.stderr)
        continue

    if article_text is None:
        print("skipping", article_url, "- article body not found",
              file=sys.stderr)
        continue

    print()
    print("----- beginning of article")
    print(habr_new.text_content())
    print("----- beginning of text")
    print(article_text)
    print("----- end of text")
    print("----- end of article")