29 lines
1.1 KiB
Python
29 lines
1.1 KiB
Python
import requests
|
|
from lxml import html
|
|
from lxml import etree
|
|
|
|
# Browser-like User-Agent header sent with every request made by this script.
useragent_headers = {
    "User-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests.get() may block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Errors meaning "this page could not be downloaded or parsed":
#   * requests.RequestException - connection failure, timeout, bad/relative
#     URL, or a non-2xx status reported by raise_for_status();
#   * etree.LxmlError           - lxml could not parse the body (e.g. empty);
#   * ValueError                - lxml rejects str input that carries an
#     encoding declaration.
# Anything else is a programming error and is allowed to propagate.
_FETCH_ERRORS = (requests.RequestException, etree.LxmlError, ValueError)


def _fetch_tree(url):
    """Download *url* and return the page parsed as an lxml HTML element.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.RequestException: On network errors, timeouts, invalid URLs
            or a non-2xx HTTP status.
        lxml.etree.LxmlError: If the response body cannot be parsed.
    """
    response = requests.get(url, headers=useragent_headers, timeout=REQUEST_TIMEOUT)
    # Do not try to scrape an error page as if it were real content.
    response.raise_for_status()
    return html.fromstring(response.text)


# A failure to load the front page is fatal: there is nothing to iterate over.
site_tree = _fetch_tree("https://habr.com/")

# Title links of the entries in the front page's news block.
habr_news = site_tree.xpath(
    '//div[@class="new-block"]//ul[@class="content-list"]'
    '/li[@class="content-list__item content-list__item_news-topic"]'
    '//a[@class="news-topic__title"]'
)

for habr_new in habr_news:
    article_url = habr_new.get("href")
    if not article_url:
        # Link without a target: nothing to download.
        continue

    try:
        article_tree = _fetch_tree(article_url)
    except _FETCH_ERRORS:
        # Best effort: one unreachable article must not abort the whole run.
        continue

    body_nodes = article_tree.xpath(
        '//div[@class="post__text post__text-html post__text_v1" and @id="post-content-body"]'
    )
    if not body_nodes:
        # The page has no article body with the expected markup; skip it.
        continue

    # etree.tostring() with default arguments returns ASCII bytes (non-ASCII
    # characters become numeric character references), so decoding is safe.
    article_text = etree.tostring(body_nodes[0]).decode("utf-8")

    print()
    print("----- beginning of article")
    print(habr_new.text_content())
    print("----- beginning of text")
    print(article_text)
    print("----- end of text")
    print("----- end of article")