generate-domains-blocklist: use the same name for the directory and the tool

This commit is contained in:
Frank Denis 2020-12-18 21:27:33 +01:00
parent 77f81cc8c2
commit c17461ed42
5 changed files with 2 additions and 0 deletions

View file

@ -0,0 +1,76 @@
163.com
a-msedge.net
amazon.com
app.link
apple.com
appsflyer.com
azurewebsites.net
baidu.com
bankofamerica.com
cdn.cloudflare.net
cdn.optimizely.com
cdnetworks.com
cdninstagram.com
ce5.at
cloudapp.net
demon-networks.com
dl.360safe.com
download.dnscrypt.info
edgekey.net
edgesuite.net
elasticbeanstalk.com
eliteoz.com.au
fastly.net
github.com
github.io
googleadservices.com
guiltfreefoodguide.com
gvt1.com
gvt2.com
heritagehighway.com.au
invalid
j.mp
kundencenter.telekom.de
l-msedge.net
labarchitects.com
lan
liveinternet.ru
localdomain
localhost
login.microsoftonline.com
login.yahoo.com
mdidesign.ca
microsoft.com
mobiledl.adobe.com
msedge.net
msftconnecttest.com
msftncsi.com
nsatc.net
ocsp.apple.com
onedrive.live.com
outlook.live.com
ovh.net
paypal.com
pbs.twimg.com
polyfill.io
prod.msocdn.com
pusher.com
pusherapp.com
qualtrics.com
raw.githubusercontent.com
rcollard.com
revinate.com
s.youtube.com
smtp.mail.yahoo.com
someonewhocares.org
spotify.com
static.parastorage.com
storage.googleapis.com
syandus.com
tagcommander.com
urldefense.proofpoint.com
userbenchmark.com
vk.com
vlab.pp.ru
windows.net
youtu.be

View file

@ -0,0 +1,41 @@
# Local set of patterns to block
ad.*
ads.*
banner.*
banners.*
creatives.*
oas.*
oascentral.*
tag.*
telemetry.*
tracker.*
# My MacBook constantly sends a lot of useless queries to *.local,
# so I block them. *.lan is apparently another common one, and
# *.localdomain and *.workgroup are common on Windows.
*.lan
*.local
*.localdomain
*.workgroup
# eth0.me is hardcoded in tools such as Archey, but is not available any
# more, causing issues such as terminal sessions taking a long time to
# start.
eth0.me
# ibpxl.com is a tracker that seems to frequently have issues, causing
# page loads to stall.
ibpxl.com
# ditto for that one
internetbrands.com
# Ubuntu's motd script sends way too much information to Canonical
motd.ubuntu.com

View file

@ -0,0 +1,170 @@
##################################################################################
# #
# Generate a block list of domains using public data sources, and the local #
# domains-blocklist-local-additions.txt file. #
# #
# The default configuration is just indicative, and corresponds to the one #
# used to produce the public "mybase" set. #
# #
# Comment out the URLs of the sources you wish to disable, leave the ones #
# you would like enabled uncommented. Then run the script to build the #
# dnscrypt-blocklist-domains.txt file: #
# #
# $ generate-domains-blocklist.py > dnscrypt-blocklist-domains.txt #
# #
# Domains that should never be blocked can be put into a file named #
# domains-allowlist.txt. #
# #
# That blocklist file can then be used in the dnscrypt-proxy.toml file: #
# #
# [blocklist] #
# #
# blocklist_file = 'dnscrypt-blocklist-domains.txt' #
# #
##################################################################################
# Local additions
file:domains-blocklist-local-additions.txt
# AdAway is an open source ad blocker for Android using the hosts file.
# https://raw.githubusercontent.com/AdAway/adaway.github.io/master/hosts.txt
# Malware domains
https://mirror1.malwaredomains.com/files/justdomains
# Malware Domain List
https://www.malwaredomainlist.com/hostslist/hosts.txt
# EasyList
# https://easylist-downloads.adblockplus.org/easylist_noelemhide.txt
# EasyList China
# https://easylist-downloads.adblockplus.org/easylistchina.txt
# RU AdList
# https://easylist-downloads.adblockplus.org/advblock.txt
# Peter Lowe's Ad and tracking server list
https://pgl.yoyo.org/adservers/serverlist.php?hostformat=nohtml
# Spam404
# https://raw.githubusercontent.com/Spam404/lists/master/main-blacklist.txt
# Malvertising filter list by Disconnect
# https://s3.amazonaws.com/lists.disconnect.me/simple_malvertising.txt
# Ads filter list by Disconnect
# https://s3.amazonaws.com/lists.disconnect.me/simple_ad.txt
# Basic tracking list by Disconnect
# https://s3.amazonaws.com/lists.disconnect.me/simple_tracking.txt
# Sysctl list (ads)
# http://sysctl.org/cameleon/hosts
# KAD host file (fraud/adware) without controversies
# https://raw.githubusercontent.com/PolishFiltersTeam/KADhosts/master/KADhosts_without_controversies.txt
# BarbBlock list (spurious and invalid DMCA takedowns)
https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt
# Dan Pollock's hosts list
# https://someonewhocares.org/hosts/hosts
# NoTracking's list - blocking ads, trackers and other online garbage
https://raw.githubusercontent.com/notracking/hosts-blocklists/master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt
# NextDNS CNAME cloaking list
https://raw.githubusercontent.com/nextdns/cname-cloaking-blocklist/master/domains
# AdGuard Simplified Domain Names filter
# https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt
# Geoffrey Frogeye's block list of first-party trackers - https://hostfiles.frogeye.fr/
https://hostfiles.frogeye.fr/firstparty-trackers.txt
# CoinBlockerLists: blocks websites serving cryptocurrency miners - https://gitlab.com/ZeroDot1/CoinBlockerLists/ - Contains false positives
# https://gitlab.com/ZeroDot1/CoinBlockerLists/raw/master/list_browser.txt
# Websites potentially publishing fake news
# https://raw.githubusercontent.com/marktron/fakenews/master/fakenews
# Quidsup NoTrack Blocklist - Contains too many false positives to be enabled by default
# https://gitlab.com/quidsup/notrack-blocklists/raw/master/notrack-blocklist.txt
# Quidsup Malware Blocklist - Contains too many false positives to be enabled by default
# https://gitlab.com/quidsup/notrack-blocklists/raw/master/notrack-malware.txt
# AntiSocial Blacklist is an extensive collection of potentially malicious domains
# https://theantisocialengineer.com/AntiSocial_Blacklist_Community_V1.txt
# Steven Black hosts file
# https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts
# A list of adserving and tracking sites maintained by @anudeepND
# https://raw.githubusercontent.com/anudeepND/blacklist/master/adservers.txt
# Anudeep's Blacklist (CoinMiner) - Blocks cryptojacking sites
# https://raw.githubusercontent.com/anudeepND/blacklist/master/CoinMiner.txt
# Block Spotify ads
# https://gitlab.com/CHEF-KOCH/cks-filterlist/-/raw/master/Anti-Corp/Spotify/Spotify-HOSTS.txt
### Spark < Blu Go < Blu < Basic < Ultimate
### (With pornware blocking) Porn < Unified
# Energized Ultimate
# https://block.energized.pro/ultimate/formats/domains.txt
# Energized Basic
# https://block.energized.pro/basic/formats/domains.txt
# Energized BLU
# https://block.energized.pro/blu/formats/domains.txt
# OISD.NL - Blocks ads, phishing, malware, tracking and more. Tries to minimize false positives.
https://dbl.oisd.nl/
# OISD.NL (smaller subset) - Blocks ads, phishing, malware, tracking and more.
# https://hosts.oisd.nl/light
# Captain Miao ad list - Block ads and trackers, especially Chinese and Android trackers
# https://raw.githubusercontent.com/jdlingyu/ad-wars/master/sha_ad_hosts
# Dynamic DNS services, sadly often used by malware
# https://mirror1.malwaredomains.com/files/dynamic_dns.txt
# Phishing Army - https://phishing.army/
# https://phishing.army/download/phishing_army_blocklist.txt
# Block pornography
# https://raw.githubusercontent.com/Clefspeare13/pornhosts/master/0.0.0.0/hosts
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts
# https://raw.githubusercontent.com/cbuijs/shallalist/master/porn/domains
# https://raw.githubusercontent.com/olbat/ut1-blacklists/master/blacklists/adult/domains
# https://block.energized.pro/porn/formats/domains.txt
# https://raw.githubusercontent.com/mhxion/pornaway/master/hosts/porn_sites.txt
# Block gambling sites
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/gambling-hosts
# https://raw.githubusercontent.com/olbat/ut1-blacklists/master/blacklists/gambling/domains
# Block dating websites
# https://raw.githubusercontent.com/olbat/ut1-blacklists/master/blacklists/dating/domains
# Block social media sites
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/social-hosts
# https://block.energized.pro/extensions/social/formats/domains.txt
# https://raw.githubusercontent.com/olbat/ut1-blacklists/master/blacklists/social_networks/domains
# Goodbye Ads - Specially designed for mobile ad protection
# https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Hosts/GoodbyeAds.txt
# NextDNS BitTorrent blocklist
# https://raw.githubusercontent.com/nextdns/bittorrent-blocklist/master/domains
# Block spying and tracking on Windows
# https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/dnscrypt/spy.txt
# GameIndustry.eu - Block spyware, advertising, analytics, tracking in games and associated clients
# https://www.gameindustry.eu/files/hosts.txt

View file

@ -0,0 +1,10 @@
## Rules to be applied at specific times
##
## This requires a time schedule to be defined in the
## dnscrypt-proxy.toml configuration file.
##
## This file must be loaded using the --time-restricted command-line flag of the generate-domains-blocklist.py utility.
# twitter.com @work
# facebook.com @work
# *.youtube.* @time-to-sleep

View file

@ -0,0 +1,337 @@
#! /usr/bin/env python3
# run with python generate-domains-blocklist.py > list.txt.tmp && mv -f list.txt.tmp list
from __future__ import print_function
import argparse
import re
import sys
import fnmatch
try:
import urllib2 as urllib
URLLIB_NEW = False
except (ImportError, ModuleNotFoundError):
import urllib.request as urllib
from urllib.request import Request
URLLIB_NEW = True
def parse_trusted_list(content):
    """Parse a trusted (local) list.

    Accepts plain names, glob patterns, and an optional trailing "@label"
    time-restriction tag. Returns (names, time_restrictions, globs) where
    time_restrictions maps name -> "@label" and globs is a subset of names.
    """
    comment_rx = re.compile(r"^(#|$)")
    inline_comment_rx = re.compile(r"\s*#\s*[a-z0-9-].*$")
    entry_rx = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")
    names = set()
    time_restrictions = {}
    globs = set()
    for raw_line in content.splitlines():
        line = raw_line.strip().lower()
        if comment_rx.match(line):
            continue
        line = inline_comment_rx.sub("", line).strip()
        if is_glob(line):
            # Glob patterns are tracked separately but also count as names.
            globs.add(line)
            names.add(line)
            continue
        matched = entry_rx.match(line)
        if matched:
            name = matched.group(1)
            names.add(name)
            label = matched.group(2)
            if label:
                time_restrictions[name] = label
    return names, time_restrictions, globs
def parse_list(content, trusted=False):
    """Extract domain names from a blocklist in any of several formats.

    Recognized formats: AdBlock-style (||example.com^), plain domain lists,
    hosts files (IP + name), Malware Domain List CSV, "name,...,date," rows,
    and dnsmasq address=/name/ entries. Trusted (local) sources are handed
    off to parse_trusted_list instead.

    Returns (names, time_restrictions, globs); the last two are always empty
    for untrusted sources.
    """
    if trusted:
        return parse_trusted_list(content)
    comment_rx = re.compile(r"^(#|$)")
    inline_comment_rx = re.compile(r"\s*#\s*[a-z0-9-].*$")
    format_rxs = [
        # AdBlock filter entries
        re.compile(
            r"^@*\|\|([a-z0-9][a-z0-9.-]*[.][a-z]{2,})\^?(\$(popup|third-party))?$"
        ),
        # Plain domain name per line
        re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"),
        # hosts-file entries: "0.0.0.0 name"
        re.compile(
            r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"
        ),
        # Malware Domain List CSV
        re.compile(r'^"[^"]+","([a-z0-9][a-z0-9.-]*[.][a-z]{2,})",'),
        # "name,...,timestamp," rows
        re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,}),.+,[0-9: /-]+,"),
        # dnsmasq address=/name/...
        re.compile(r"^address=/([a-z0-9][a-z0-9.-]*[.][a-z]{2,})/."),
    ]
    names = set()
    for raw_line in content.splitlines():
        line = raw_line.strip().lower()
        if comment_rx.match(line):
            continue
        line = inline_comment_rx.sub("", line).strip()
        for rx in format_rxs:
            matched = rx.match(line)
            if matched:
                names.add(matched.group(1))
    return names, {}, set()
def print_restricted_name(output_fd, name, time_restrictions):
    """Write a time-restricted entry, or a note when its label is missing."""
    restriction = time_restrictions.get(name)
    if restriction is None:
        print(
            "# ignored: [{}] was in the time-restricted list, "
            "but without a time restriction label".format(name),
            file=output_fd,
            end="\n",
        )
    else:
        print("{}\t{}".format(name, restriction), file=output_fd, end="\n")
def load_from_url(url):
    """Fetch the content behind *url* and return (content, trusted).

    "file:" URLs are flagged as trusted (local) sources; only untrusted
    (remote) responses are required to return HTTP 200. Uses the module-level
    urllib shim (urllib2 on Python 2, urllib.request on Python 3) and the
    global command-line args for the timeout.
    """
    sys.stderr.write("Loading data from [{}]\n".format(url))
    req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})
    # Request.type is a property on Python 3; Python 2 uses get_type().
    req_type = req.type if URLLIB_NEW else req.get_type()
    trusted = req_type == "file"
    try:
        response = urllib.urlopen(req, timeout=int(args.timeout))
    except urllib.URLError as err:
        raise Exception("[{}] could not be loaded: {}\n".format(url, err))
    if trusted is False and response.getcode() != 200:
        raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
    content = response.read()
    if URLLIB_NEW:
        # Python 3 returns bytes; normalize to text.
        content = content.decode("utf-8", errors="replace")
    return content, trusted
def name_cmp(name):
    """Sort key: labels in reverse order, so related domains sort together."""
    return ".".join(reversed(name.split(".")))
def is_glob(pattern):
    """Heuristically decide whether *pattern* is a glob rather than a name.

    "?" or "[" anywhere makes it a candidate; "*" only counts when it is not
    the first character and is either not the last character or follows a
    dot. NOTE(review): a pattern whose only "*" is the leading character
    (e.g. "*.lan") is deliberately not flagged here — presumably handled as
    a plain entry downstream; confirm against the consumer.
    """
    last = len(pattern) - 1
    candidate = False
    for i, c in enumerate(pattern):
        if c in ("?", "["):
            candidate = True
        elif c == "*" and i > 0 and (i < last or pattern[i - 1] == "."):
            candidate = True
    if not candidate:
        return False
    try:
        # Only report a glob if fnmatch accepts the pattern.
        fnmatch.fnmatch("example", pattern)
        return True
    except:
        return False
def covered_by_glob(globs, name):
    """Return True if *name* matches any pattern in *globs*.

    A glob entry never covers itself, so glob lines survive deduplication.
    """
    if name in globs:
        return False
    matched = False
    for pattern in globs:
        try:
            matched = fnmatch.fnmatch(name, pattern)
        except:
            # Malformed patterns are simply skipped.
            matched = False
        if matched:
            break
    return matched
def has_suffix(names, name):
    """Return True when a strict parent domain of *name* is in *names*.

    "www.example.com" is covered by "example.com", but a name never covers
    itself.
    """
    labels = name.split(".")
    for start in range(1, len(labels) + 1):
        if ".".join(labels[start:]) in names:
            return True
    return False
def allowlist_from_url(url):
    """Load the set of allowed names from *url*; empty set when url is falsy."""
    if not url:
        return set()
    content, trusted = load_from_url(url)
    return parse_list(content, trusted)[0]
def blocklists_from_config_file(
    file, allowlist, time_restricted_url, ignore_retrieval_failure, output_file
):
    """Build the unified blocklist described by the configuration file.

    file: config file listing one source URL (or local path) per line;
        lines starting with "#" and blank lines are skipped.
    allowlist: URL/path of names that must never be blocked (may be falsy).
    time_restricted_url: URL/path of names that are blocked on a schedule.
    ignore_retrieval_failure: when True, a source that fails to load is
        reported and skipped instead of aborting the whole run.
    output_file: path to write the result to; stdout when falsy.

    Exits with status 1 when a source cannot be retrieved and failures are
    not ignored.
    """
    blocklists = {}
    allowed_names = set()
    all_names = set()
    unique_names = set()
    all_globs = set()

    # Load the configuration and fetch every enabled source.
    with open(file) as fd:
        for line in fd:
            line = line.strip()
            if line.startswith("#") or line == "":
                continue
            url = line
            try:
                content, trusted = load_from_url(url)
                names, _time_restrictions, globs = parse_list(content, trusted)
                blocklists[url] = names
                all_names |= names
                all_globs |= globs
            except Exception as e:
                sys.stderr.write(str(e))
                if not ignore_retrieval_failure:
                    sys.exit(1)

    # Normalize a bare path to a "file:" URL so load_from_url accepts it.
    if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
        time_restricted_url = "file:" + time_restricted_url

    output_fd = sys.stdout
    if output_file:
        output_fd = open(output_file, "w")
    try:
        # Time-based blocklist
        if time_restricted_url:
            time_restricted_content, _trusted = load_from_url(time_restricted_url)
            time_restricted_names, time_restrictions, _globs = parse_trusted_list(
                time_restricted_content
            )
            if time_restricted_names:
                print(
                    "########## Time-based blocklist ##########\n",
                    file=output_fd,
                    end="\n",
                )
                for name in time_restricted_names:
                    print_restricted_name(output_fd, name, time_restrictions)
            # Time restricted names should be allowed, or they could be always blocked
            allowed_names |= time_restricted_names

        # Allowed list
        if allowlist and not re.match(r"^[a-z0-9]+:", allowlist):
            allowlist = "file:" + allowlist
        allowed_names |= allowlist_from_url(allowlist)

        # Process blocklists: emit each source's names, skipping entries
        # covered by globs, duplicates/parent domains, and allowlisted names.
        for url, names in blocklists.items():
            print(
                "\n\n########## Blocklist from {} ##########\n".format(url),
                file=output_fd,
                end="\n",
            )
            ignored, glob_ignored, allowed = 0, 0, 0
            list_names = list()
            for name in names:
                if covered_by_glob(all_globs, name):
                    glob_ignored += 1
                elif has_suffix(all_names, name) or name in unique_names:
                    ignored += 1
                elif has_suffix(allowed_names, name) or name in allowed_names:
                    allowed += 1
                else:
                    list_names.append(name)
                    unique_names.add(name)
            list_names.sort(key=name_cmp)
            if ignored:
                print(
                    "# Ignored duplicates: {}".format(ignored),
                    file=output_fd,
                    end="\n",
                )
            if glob_ignored:
                print(
                    "# Ignored due to overlapping local patterns: {}".format(
                        glob_ignored
                    ),
                    file=output_fd,
                    end="\n",
                )
            if allowed:
                print(
                    "# Ignored entries due to the allowlist: {}".format(allowed),
                    file=output_fd,
                    end="\n",
                )
            if ignored or glob_ignored or allowed:
                print(file=output_fd, end="\n")
            for name in list_names:
                print(name, file=output_fd, end="\n")
    finally:
        # Fix: the original unconditionally called output_fd.close(), which
        # closed sys.stdout when no --output-file was given, and leaked the
        # file handle on exceptions. Only close the descriptor we opened.
        if output_file:
            output_fd.close()
# ---------------------------------------------------------------------------
# Command-line entry point: parse options and generate the blocklist.
# By default the result is written to stdout, so all diagnostics below must
# go to stderr to keep redirected output clean.
# ---------------------------------------------------------------------------
argp = argparse.ArgumentParser(
    description="Create a unified blocklist from a set of local and remote files"
)
argp.add_argument(
    "-c",
    "--config",
    default="domains-blocklist.conf",
    help="file containing blocklist sources",
)
argp.add_argument(
    "-w",
    "--whitelist",
    # Deprecated alias of -a/--allowlist; hidden so old invocations fail
    # with an explicit migration message instead of silently doing nothing.
    help=argparse.SUPPRESS,
)
argp.add_argument(
    "-a",
    "--allowlist",
    default="domains-allowlist.txt",
    help="file containing a set of names to exclude from the blocklist",
)
argp.add_argument(
    "-r",
    "--time-restricted",
    default="domains-time-restricted.txt",
    help="file containing a set of names to be time restricted",
)
argp.add_argument(
    "-i",
    "--ignore-retrieval-failure",
    action="store_true",
    help="generate list even if some urls couldn't be retrieved",
)
argp.add_argument(
    "-o",
    "--output-file",
    default=None,
    help="save generated blocklist to a text file with the provided file name",
)
argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")
args = argp.parse_args()

whitelist = args.whitelist
if whitelist:
    # Fix: the migration notice and help text used to be printed to stdout,
    # where they would be mixed into a redirected blocklist; use stderr.
    print(
        "The option to provide a set of names to exclude from the blocklist has been changed from -w to -a\r\n",
        file=sys.stderr,
    )
    argp.print_help(sys.stderr)
    sys.exit(1)

conf = args.config
allowlist = args.allowlist
time_restricted = args.time_restricted
ignore_retrieval_failure = args.ignore_retrieval_failure
output_file = args.output_file

blocklists_from_config_file(
    conf, allowlist, time_restricted, ignore_retrieval_failure, output_file
)