diff --git a/utils/generate-domains-blacklists/domains-blacklist-all.conf b/utils/generate-domains-blacklists/domains-blacklist-all.conf
new file mode 100644
index 00000000..b39d0f08
--- /dev/null
+++ b/utils/generate-domains-blacklists/domains-blacklist-all.conf
@@ -0,0 +1,93 @@
+
+##################################################################################
+#                                                                                #
+# Generate a black list of domains using public data sources, and the local     #
+# domains-blacklist-local-additions.txt file.                                    #
+#                                                                                #
+# Comment the URLs of the sources you want to disable, and run the script to    #
+# build the dnscrypt-blacklist-domains.txt file:                                 #
+#                                                                                #
+# $ generate-domains-blacklist.py > dnscrypt-blacklist-domains.txt              #
+#                                                                                #
+# That blacklist file can then be used in the dnscrypt-proxy configuration:     #
+#                                                                                #
+# BlackList domains:/etc/dnscrypt-blacklist-domains.txt                          #
+#                                                                                #
+##################################################################################
+
+# Local additions
+file:domains-blacklist-local-additions.txt
+
+# Bambenek malware C2s
+http://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt
+
+# hpHosts’ Ad and tracking servers
+http://hosts-file.net/.%5Cad_servers.txt
+
+# Malware domains
+http://mirror1.malwaredomains.com/files/justdomains
+
+# Abuse.ch Ransomware Tracker
+http://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt
+
+# Malware Domain List
+http://www.malwaredomainlist.com/mdlcsv.php?inactive=off
+
+# Adblock Warning Removal List
+https://easylist-downloads.adblockplus.org/antiadblockfilters.txt
+
+# EasyList
+https://easylist-downloads.adblockplus.org/easylist_noelemhide.txt
+
+# EasyList China
+https://easylist-downloads.adblockplus.org/easylistchina.txt
+
+# Fanboy’s Social Blocking List
+https://easylist-downloads.adblockplus.org/fanboy-social.txt
+
+# Peter Lowe’s Ad and tracking server list
+https://pgl.yoyo.org/adservers/serverlist.php
+
+# Spam404
+https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt
+
+# CJX Annoyance List
+https://raw.githubusercontent.com/cjx82630/cjxlist/master/cjxlist.txt
+
+# EU: Prebake - Filter Obtrusive Cookie Notices
+https://raw.githubusercontent.com/liamja/Prebake/master/obtrusive.txt
+
+# Malvertising filter list by Disconnect
+https://s3.amazonaws.com/lists.disconnect.me/simple_malvertising.txt
+
+# Malware filter list by Disconnect
+https://s3.amazonaws.com/lists.disconnect.me/simple_malware.txt
+
+# Basic tracking list by Disconnect
+https://s3.amazonaws.com/lists.disconnect.me/simple_tracking.txt
+
+# Quidsup NoTrack
+https://raw.githubusercontent.com/quidsup/notrack/master/trackers.txt
+
+# Sysctl list (ads)
+http://sysctl.org/cameleon/hosts
+
+# KAD host file (fraud/adware) - https://github.com/azet12/KADhosts
+https://raw.githubusercontent.com/azet12/KADhosts/master/KADhosts.txt
+
+# Fake news sites
+https://raw.githubusercontent.com/marktron/fakenews/master/fakenews
+
+# Dynamic DNS services, sadly often used by malware
+http://mirror2.malwaredomains.com/files/dynamic_dns.txt
+
+# Block pornography
+https://raw.githubusercontent.com/Clefspeare13/pornhosts/master/0.0.0.0/hosts
+https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts
+http://securemecca.com/Downloads/hosts.txt
+
+# Block gambling sites
+https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/gambling-hosts
+
+# Block social media sites
+# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/social-hosts
diff --git a/utils/generate-domains-blacklists/domains-blacklist-local-additions.txt b/utils/generate-domains-blacklists/domains-blacklist-local-additions.txt
new file mode 100644
index 00000000..c9eb5d2b
--- /dev/null
+++ b/utils/generate-domains-blacklists/domains-blacklist-local-additions.txt
@@ -0,0 +1,29 @@
+
+# Local set of patterns to block
+
+ad.*
+ads.*
+banner.*
+banners.*
+creatives.*
+oas.*
+oascentral.*
+stats.*
+tag.*
+telemetry.*
+tracker.*
+
+# My Macbook constantly sends a lot of useless queries to *.local,
+# so I block them. *.lan is apparently another common one, and
+# *.localdomain and *.workgroup are common on Windows.
+
+*.lan
+*.local
+*.localdomain
+*.workgroup
+
+# eth0.me is hardcoded in tools such as Archey, but is not available any
+# more, causing issues such as terminal sessions taking a long time to
+# start.
+
+eth0.me
diff --git a/utils/generate-domains-blacklists/domains-blacklist.conf b/utils/generate-domains-blacklists/domains-blacklist.conf
new file mode 100644
index 00000000..7093aad8
--- /dev/null
+++ b/utils/generate-domains-blacklists/domains-blacklist.conf
@@ -0,0 +1,106 @@
+
+##################################################################################
+#                                                                                #
+# Generate a black list of domains using public data sources, and the local     #
+# domains-blacklist-local-additions.txt file.                                    #
+#                                                                                #
+# The default configuration is just indicative, and corresponds to the one      #
+# used to produce the public "mybase" set.                                       #
+#                                                                                #
+# Comment out the URLs of the sources you wish to disable, leave the ones       #
+# you would like enabled uncommented. Then run the script to build the          #
+# dnscrypt-blacklist-domains.txt file:                                           #
+#                                                                                #
+# $ generate-domains-blacklist.py > dnscrypt-blacklist-domains.txt              #
+#                                                                                #
+# Domains that should never be blocked can be put into a file named             #
+# domains-whitelist.txt.                                                         #
+#                                                                                #
+# That blacklist file can then be used in the dnscrypt-proxy configuration:     #
+#                                                                                #
+# BlackList domains:/etc/dnscrypt-blacklist-domains.txt                          #
+#                                                                                #
+##################################################################################
+
+# Local additions
+file:domains-blacklist-local-additions.txt
+
+# Bambenek malware C2s
+http://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt
+
+# hpHosts’ Ad and tracking servers
+http://hosts-file.net/.%5Cad_servers.txt
+
+# Malware domains
+http://mirror1.malwaredomains.com/files/justdomains
+
+# Abuse.ch Ransomware Tracker
+http://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt
+
+# Malware Domain List
+http://www.malwaredomainlist.com/mdlcsv.php?inactive=off
+
+# Adblock Warning Removal List
+https://easylist-downloads.adblockplus.org/antiadblockfilters.txt
+
+# EasyList
+https://easylist-downloads.adblockplus.org/easylist_noelemhide.txt
+
+# EasyList China
+https://easylist-downloads.adblockplus.org/easylistchina.txt
+
+# Fanboy’s Social Blocking List
+https://easylist-downloads.adblockplus.org/fanboy-social.txt
+
+# Peter Lowe’s Ad and tracking server list
+https://pgl.yoyo.org/adservers/serverlist.php
+
+# Spam404
+https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt
+
+# CJX Annoyance List
+https://raw.githubusercontent.com/cjx82630/cjxlist/master/cjxlist.txt
+
+# EU: Prebake - Filter Obtrusive Cookie Notices
+https://raw.githubusercontent.com/liamja/Prebake/master/obtrusive.txt
+
+# Malvertising filter list by Disconnect
+https://s3.amazonaws.com/lists.disconnect.me/simple_malvertising.txt
+
+# Malware filter list by Disconnect
+https://s3.amazonaws.com/lists.disconnect.me/simple_malware.txt
+
+# Basic tracking list by Disconnect
+https://s3.amazonaws.com/lists.disconnect.me/simple_tracking.txt
+
+# Sysctl list (ads)
+http://sysctl.org/cameleon/hosts
+
+# KAD host file (fraud/adware) - https://github.com/azet12/KADhosts
+https://raw.githubusercontent.com/azet12/KADhosts/master/KADhosts.txt
+
+# BarbBlock list (spurious and invalid DMCA takedowns)
+https://ssl.bblck.me/blacklists/domain-list.txt
+
+# Dan Pollock's hosts list
+http://someonewhocares.org/hosts/hosts
+
+# Websites potentially publishing fake news
+# https://raw.githubusercontent.com/marktron/fakenews/master/fakenews
+
+# Quidsup NoTrack - Contains too many false positives to be enabled by default
+# https://raw.githubusercontent.com/quidsup/notrack/master/trackers.txt
+
+# Dynamic DNS services, sadly often used by malware
+# http://mirror2.malwaredomains.com/files/dynamic_dns.txt
+
+# Block pornography
+# https://raw.githubusercontent.com/Clefspeare13/pornhosts/master/0.0.0.0/hosts
+# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts
+# http://securemecca.com/Downloads/hosts.txt
+
+# Block gambling sites
+# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/gambling-hosts
+
+# Block social media sites
+# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/social-hosts
diff --git a/utils/generate-domains-blacklists/domains-whitelist.txt b/utils/generate-domains-blacklists/domains-whitelist.txt
new file mode 100644
index 00000000..40453988
--- /dev/null
+++ b/utils/generate-domains-blacklists/domains-whitelist.txt
@@ -0,0 +1,23 @@
+a-msedge.net
+amazon.com
+appsflyer.com
+azurewebsites.net
+cdnetworks.com
+cloudapp.net
+edgekey.net
+elasticbeanstalk.com
+invalid
+j.mp
+l-msedge.net
+lan
+localdomain
+microsoft.com
+msedge.net
+nsatc.net
+ovh.net
+pusher.com
+pusherapp.com
+spotify.com
+tagcommander.com
+tracker.debian.org
+windows.net
diff --git a/utils/generate-domains-blacklists/generate-domains-blacklist.py b/utils/generate-domains-blacklists/generate-domains-blacklist.py
new file mode 100755
index 00000000..772b8849
--- /dev/null
+++ b/utils/generate-domains-blacklists/generate-domains-blacklist.py
@@ -0,0 +1,140 @@
+#! /usr/bin/env python
+
+# run with python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list
+
+import argparse
+import re
+import sys
+import urllib2
+
+
+def parse_blacklist(content, trusted=False):
+    rx_comment = re.compile(r'^(#|$)')
+    rx_inline_comment = re.compile(r'\s*#\s*[a-z0-9-].*$')
+    rx_u = re.compile(r'^@*\|\|([a-z0-9.-]+[.][a-z]{2,})\^?(\$(popup|third-party))?$')
+    rx_l = re.compile(r'^([a-z0-9.-]+[.][a-z]{2,})$')
+    rx_h = re.compile(r'^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9.-]+[.][a-z]{2,})$')
+    rx_mdl = re.compile(r'^"[^"]+","([a-z0-9.-]+[.][a-z]{2,})",')
+    rx_b = re.compile(r'^([a-z0-9.-]+[.][a-z]{2,}),.+,[0-9: /-]+,')
+    rx_trusted = re.compile(r'^([*a-z0-9.-]+)$')
+
+    names = set()
+    rx_set = [rx_u, rx_l, rx_h, rx_mdl, rx_b]
+    if trusted:
+        rx_set = [rx_trusted]
+    for line in content.splitlines():
+        line = str.lower(str.strip(line))
+        if rx_comment.match(line):
+            continue
+        line = rx_inline_comment.sub('', line)
+        for rx in rx_set:
+            matches = rx.match(line)
+            if not matches:
+                continue
+            name = matches.group(1)
+            names.add(name)
+    return names
+
+
+def list_from_url(url):
+    sys.stderr.write("Loading data from [{}]\n".format(url))
+    req = urllib2.Request(url)
+    trusted = False
+    if req.get_type() == "file":
+        trusted = True
+    response = None
+    try:
+        response = urllib2.urlopen(req, timeout=10)
+    except urllib2.URLError as err:
+        raise Exception("[{}] could not be loaded: {}\n".format(url, err))
+    if trusted is False and response.getcode() != 200:
+        raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
+    content = response.read()
+
+    return parse_blacklist(content, trusted)
+
+
+def name_cmp(name):
+    parts = name.split(".")
+    parts.reverse()
+    return str.join(".", parts)
+
+
+def has_suffix(names, name):
+    parts = str.split(name, ".")
+    while parts:
+        parts = parts[1:]
+        if str.join(".", parts) in names:
+            return True
+
+    return False
+
+
+def whitelist_from_url(url):
+    if not url:
+        return set()
+
+    return list_from_url(url)
+
+
+def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure):
+    blacklists = {}
+    all_names = set()
+    unique_names = set()
+
+    if whitelist and not re.match(r'^[a-z0-9]+:', whitelist):
+        whitelist = "file:" + whitelist
+
+    whitelisted_names = whitelist_from_url(whitelist)
+
+    with open(file) as fd:
+        for line in fd:
+            line = str.strip(line)
+            if str.startswith(line, "#") or line == "":
+                continue
+            url = line
+            try:
+                names = list_from_url(url)
+                blacklists[url] = names
+                all_names |= names
+            except Exception as e:
+                sys.stderr.write(e.message)
+                if not ignore_retrieval_failure:
+                    exit(1)
+
+    for url, names in blacklists.items():
+        print("\n\n########## Blacklist from {} ##########\n".format(url))
+        ignored, whitelisted = 0, 0
+        list_names = list()
+        for name in names:
+            if has_suffix(all_names, name) or name in unique_names:
+                ignored = ignored + 1
+            elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
+                whitelisted = whitelisted + 1
+            else:
+                list_names.append(name)
+                unique_names.add(name)
+
+        list_names.sort(key=name_cmp)
+        if ignored:
+            print("# Ignored duplicates: {}\n".format(ignored))
+        if whitelisted:
+            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted))
+        for name in list_names:
+            print(name)
+
+
+argp = argparse.ArgumentParser(description="Create a unified blacklist from a set of local and remote files")
+argp.add_argument("-c", "--config", default="domains-blacklist.conf",
+                  help="file containing blacklist sources")
+argp.add_argument("-w", "--whitelist", default="domains-whitelist.txt",
+                  help="file containing a set of names to exclude from the blacklist")
+argp.add_argument("-i", "--ignore-retrieval-failure", action='store_true',
+                  help="generate list even if some urls couldn't be retrieved")
+args = argp.parse_args()
+
+conf = args.config
+whitelist = args.whitelist
+ignore_retrieval_failure = args.ignore_retrieval_failure
+
+blacklists_from_config_file(conf, whitelist, ignore_retrieval_failure)