generate-domains-blocklist: use the same name for the directory and the tool

This commit is contained in:
Frank Denis 2020-12-18 21:27:33 +01:00
parent 77f81cc8c2
commit c17461ed42
5 changed files with 2 additions and 0 deletions

View file

@ -0,0 +1,76 @@
163.com
a-msedge.net
amazon.com
app.link
apple.com
appsflyer.com
azurewebsites.net
baidu.com
bankofamerica.com
cdn.cloudflare.net
cdn.optimizely.com
cdnetworks.com
cdninstagram.com
ce5.at
cloudapp.net
demon-networks.com
dl.360safe.com
download.dnscrypt.info
edgekey.net
edgesuite.net
elasticbeanstalk.com
eliteoz.com.au
fastly.net
github.com
github.io
googleadservices.com
guiltfreefoodguide.com
gvt1.com
gvt2.com
heritagehighway.com.au
invalid
j.mp
kundencenter.telekom.de
l-msedge.net
labarchitects.com
lan
liveinternet.ru
localdomain
localhost
login.microsoftonline.com
login.yahoo.com
mdidesign.ca
microsoft.com
mobiledl.adobe.com
msedge.net
msftconnecttest.com
msftncsi.com
nsatc.net
ocsp.apple.com
onedrive.live.com
outlook.live.com
ovh.net
paypal.com
pbs.twimg.com
polyfill.io
prod.msocdn.com
pusher.com
pusherapp.com
qualtrics.com
raw.githubusercontent.com
rcollard.com
revinate.com
s.youtube.com
smtp.mail.yahoo.com
someonewhocares.org
spotify.com
static.parastorage.com
storage.googleapis.com
syandus.com
tagcommander.com
urldefense.proofpoint.com
userbenchmark.com
vk.com
vlab.pp.ru
windows.net
youtu.be

View file

@ -0,0 +1,41 @@
# Local set of patterns to block
ad.*
ads.*
banner.*
banners.*
creatives.*
oas.*
oascentral.*
tag.*
telemetry.*
tracker.*
# My MacBook constantly sends a lot of useless queries to *.local,
# so I block them. *.lan is apparently another common one, and
# *.localdomain and *.workgroup are common on Windows.
*.lan
*.local
*.localdomain
*.workgroup
# eth0.me is hardcoded in tools such as Archey, but is not available any
# more, causing issues such as terminal sessions taking a long time to
# start.
eth0.me
# ibpxl.com is a tracker that seems to frequently have issues, causing
# page loads to stall.
ibpxl.com
# ditto for that one
internetbrands.com
# Ubuntu's motd script sends way too much information to Canonical
motd.ubuntu.com

View file

@ -0,0 +1,170 @@
##################################################################################
# #
# Generate a block list of domains using public data sources, and the local #
# domains-blocklist-local-additions.txt file. #
# #
# The default configuration is just indicative, and corresponds to the one #
# used to produce the public "mybase" set. #
# #
# Comment out the URLs of the sources you wish to disable, leave the ones #
# you would like enabled uncommented. Then run the script to build the #
# dnscrypt-blocklist-domains.txt file: #
# #
# $ generate-domains-blocklist.py > dnscrypt-blocklist-domains.txt #
# #
# Domains that should never be blocked can be put into a file named #
# domains-allowlist.txt. #
# #
# That blocklist file can then be used in the dnscrypt-proxy.toml file: #
# #
# [blocklist] #
# #
# blocklist_file = 'dnscrypt-blocklist-domains.txt' #
# #
##################################################################################
# Local additions
file:domains-blocklist-local-additions.txt
# AdAway is an open source ad blocker for Android using the hosts file.
# https://raw.githubusercontent.com/AdAway/adaway.github.io/master/hosts.txt
# Malware domains
https://mirror1.malwaredomains.com/files/justdomains
# Malware Domain List
https://www.malwaredomainlist.com/hostslist/hosts.txt
# EasyList
# https://easylist-downloads.adblockplus.org/easylist_noelemhide.txt
# EasyList China
# https://easylist-downloads.adblockplus.org/easylistchina.txt
# RU AdList
# https://easylist-downloads.adblockplus.org/advblock.txt
# Peter Lowe's Ad and tracking server list
https://pgl.yoyo.org/adservers/serverlist.php?hostformat=nohtml
# Spam404
# https://raw.githubusercontent.com/Spam404/lists/master/main-blacklist.txt
# Malvertising filter list by Disconnect
# https://s3.amazonaws.com/lists.disconnect.me/simple_malvertising.txt
# Ads filter list by Disconnect
# https://s3.amazonaws.com/lists.disconnect.me/simple_ad.txt
# Basic tracking list by Disconnect
# https://s3.amazonaws.com/lists.disconnect.me/simple_tracking.txt
# Sysctl list (ads)
# http://sysctl.org/cameleon/hosts
# KAD host file (fraud/adware) without controversies
# https://raw.githubusercontent.com/PolishFiltersTeam/KADhosts/master/KADhosts_without_controversies.txt
# BarbBlock list (spurious and invalid DMCA takedowns)
https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt
# Dan Pollock's hosts list
# https://someonewhocares.org/hosts/hosts
# NoTracking's list - blocking ads, trackers and other online garbage
https://raw.githubusercontent.com/notracking/hosts-blocklists/master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt
# NextDNS CNAME cloaking list
https://raw.githubusercontent.com/nextdns/cname-cloaking-blocklist/master/domains
# AdGuard Simplified Domain Names filter
# https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt
# Geoffrey Frogeye's block list of first-party trackers - https://hostfiles.frogeye.fr/
https://hostfiles.frogeye.fr/firstparty-trackers.txt
# CoinBlockerLists: blocks websites serving cryptocurrency miners - https://gitlab.com/ZeroDot1/CoinBlockerLists/ - Contains false positives
# https://gitlab.com/ZeroDot1/CoinBlockerLists/raw/master/list_browser.txt
# Websites potentially publishing fake news
# https://raw.githubusercontent.com/marktron/fakenews/master/fakenews
# Quidsup NoTrack Blocklist - Contains too many false positives to be enabled by default
# https://gitlab.com/quidsup/notrack-blocklists/raw/master/notrack-blocklist.txt
# Quidsup Malware Blocklist - Contains too many false positives to be enabled by default
# https://gitlab.com/quidsup/notrack-blocklists/raw/master/notrack-malware.txt
# AntiSocial Blacklist is an extensive collection of potentially malicious domains
# https://theantisocialengineer.com/AntiSocial_Blacklist_Community_V1.txt
# Steven Black hosts file
# https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts
# A list of adserving and tracking sites maintained by @anudeepND
# https://raw.githubusercontent.com/anudeepND/blacklist/master/adservers.txt
# Anudeep's Blacklist (CoinMiner) - Blocks cryptojacking sites
# https://raw.githubusercontent.com/anudeepND/blacklist/master/CoinMiner.txt
# Block Spotify ads
# https://gitlab.com/CHEF-KOCH/cks-filterlist/-/raw/master/Anti-Corp/Spotify/Spotify-HOSTS.txt
### Spark < Blu Go < Blu < Basic < Ultimate
### (With pornware blocking) Porn < Unified
# Energized Ultimate
# https://block.energized.pro/ultimate/formats/domains.txt
# Energized Basic
# https://block.energized.pro/basic/formats/domains.txt
# Energized BLU
# https://block.energized.pro/blu/formats/domains.txt
# OISD.NL - Blocks ads, phishing, malware, tracking and more. Tries to minimize false positives.
https://dbl.oisd.nl/
# OISD.NL (smaller subset) - Blocks ads, phishing, malware, tracking and more.
# https://hosts.oisd.nl/light
# Captain Miao ad list - Block ads and trackers, especially Chinese and Android trackers
# https://raw.githubusercontent.com/jdlingyu/ad-wars/master/sha_ad_hosts
# Dynamic DNS services, sadly often used by malware
# https://mirror1.malwaredomains.com/files/dynamic_dns.txt
# Phishing Army - https://phishing.army/
# https://phishing.army/download/phishing_army_blocklist.txt
# Block pornography
# https://raw.githubusercontent.com/Clefspeare13/pornhosts/master/0.0.0.0/hosts
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts
# https://raw.githubusercontent.com/cbuijs/shallalist/master/porn/domains
# https://raw.githubusercontent.com/olbat/ut1-blacklists/master/blacklists/adult/domains
# https://block.energized.pro/porn/formats/domains.txt
# https://raw.githubusercontent.com/mhxion/pornaway/master/hosts/porn_sites.txt
# Block gambling sites
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/gambling-hosts
# https://raw.githubusercontent.com/olbat/ut1-blacklists/master/blacklists/gambling/domains
# Block dating websites
# https://raw.githubusercontent.com/olbat/ut1-blacklists/master/blacklists/dating/domains
# Block social media sites
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/social-hosts
# https://block.energized.pro/extensions/social/formats/domains.txt
# https://raw.githubusercontent.com/olbat/ut1-blacklists/master/blacklists/social_networks/domains
# Goodbye Ads - Specially designed for mobile ad protection
# https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Hosts/GoodbyeAds.txt
# NextDNS BitTorrent blocklist
# https://raw.githubusercontent.com/nextdns/bittorrent-blocklist/master/domains
# Block spying and tracking on Windows
# https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/dnscrypt/spy.txt
# GameIndustry.eu - Block spyware, advertising, analytics, tracking in games and associated clients
# https://www.gameindustry.eu/files/hosts.txt

View file

@ -0,0 +1,10 @@
## Rules to be applied at specific times
##
## This requires a time schedule to be defined in the
## dnscrypt-proxy.toml configuration file.
##
## This file must be loaded using the --time-restricted command-line flag of the generate-domains-blocklist.py utility.
# twitter.com @work
# facebook.com @work
# *.youtube.* @time-to-sleep

View file

@ -0,0 +1,337 @@
#! /usr/bin/env python3
# run with python generate-domains-blocklist.py > list.txt.tmp && mv -f list.txt.tmp list
from __future__ import print_function
import argparse
import re
import sys
import fnmatch
try:
import urllib2 as urllib
URLLIB_NEW = False
except (ImportError, ModuleNotFoundError):
import urllib.request as urllib
from urllib.request import Request
URLLIB_NEW = True
def parse_trusted_list(content):
    """Parse a trusted (local) list.

    Accepts plain names, glob patterns, and an optional trailing "@label"
    time-restriction tag. Returns (names, time_restrictions, globs) where
    time_restrictions maps name -> "@label" and globs is a subset of names.
    """
    comment_rx = re.compile(r"^(#|$)")
    inline_comment_rx = re.compile(r"\s*#\s*[a-z0-9-].*$")
    entry_rx = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")
    names = set()
    time_restrictions = {}
    globs = set()
    for raw_line in content.splitlines():
        line = raw_line.strip().lower()
        if comment_rx.match(line):
            continue
        line = inline_comment_rx.sub("", line).strip()
        if is_glob(line):
            # Glob patterns are tracked separately but also count as names.
            globs.add(line)
            names.add(line)
            continue
        matched = entry_rx.match(line)
        if matched:
            name = matched.group(1)
            names.add(name)
            label = matched.group(2)
            if label:
                time_restrictions[name] = label
    return names, time_restrictions, globs
def parse_list(content, trusted=False):
    """Extract domain names from a blocklist in any of several formats.

    Recognized formats: AdBlock-style (||example.com^), plain domain lists,
    hosts files (IP + name), Malware Domain List CSV, "name,...,date," rows,
    and dnsmasq address=/name/ entries. Trusted (local) sources are handed
    off to parse_trusted_list instead.

    Returns (names, time_restrictions, globs); the last two are always empty
    for untrusted sources.
    """
    if trusted:
        return parse_trusted_list(content)
    comment_rx = re.compile(r"^(#|$)")
    inline_comment_rx = re.compile(r"\s*#\s*[a-z0-9-].*$")
    format_rxs = [
        # AdBlock filter entries
        re.compile(
            r"^@*\|\|([a-z0-9][a-z0-9.-]*[.][a-z]{2,})\^?(\$(popup|third-party))?$"
        ),
        # Plain domain name per line
        re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"),
        # hosts-file entries: "0.0.0.0 name"
        re.compile(
            r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"
        ),
        # Malware Domain List CSV
        re.compile(r'^"[^"]+","([a-z0-9][a-z0-9.-]*[.][a-z]{2,})",'),
        # "name,...,timestamp," rows
        re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,}),.+,[0-9: /-]+,"),
        # dnsmasq address=/name/...
        re.compile(r"^address=/([a-z0-9][a-z0-9.-]*[.][a-z]{2,})/."),
    ]
    names = set()
    for raw_line in content.splitlines():
        line = raw_line.strip().lower()
        if comment_rx.match(line):
            continue
        line = inline_comment_rx.sub("", line).strip()
        for rx in format_rxs:
            matched = rx.match(line)
            if matched:
                names.add(matched.group(1))
    return names, {}, set()
def print_restricted_name(output_fd, name, time_restrictions):
    """Write a time-restricted entry, or a note when its label is missing."""
    restriction = time_restrictions.get(name)
    if restriction is None:
        print(
            "# ignored: [{}] was in the time-restricted list, "
            "but without a time restriction label".format(name),
            file=output_fd,
            end="\n",
        )
    else:
        print("{}\t{}".format(name, restriction), file=output_fd, end="\n")
def load_from_url(url):
    """Fetch the content behind *url* and return (content, trusted).

    "file:" URLs are flagged as trusted (local) sources; only untrusted
    (remote) responses are required to return HTTP 200. Uses the module-level
    urllib shim (urllib2 on Python 2, urllib.request on Python 3) and the
    global command-line args for the timeout.
    """
    sys.stderr.write("Loading data from [{}]\n".format(url))
    req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})
    # Request.type is a property on Python 3; Python 2 uses get_type().
    req_type = req.type if URLLIB_NEW else req.get_type()
    trusted = req_type == "file"
    try:
        response = urllib.urlopen(req, timeout=int(args.timeout))
    except urllib.URLError as err:
        raise Exception("[{}] could not be loaded: {}\n".format(url, err))
    if trusted is False and response.getcode() != 200:
        raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
    content = response.read()
    if URLLIB_NEW:
        # Python 3 returns bytes; normalize to text.
        content = content.decode("utf-8", errors="replace")
    return content, trusted
def name_cmp(name):
    """Sort key: labels in reverse order, so related domains sort together."""
    return ".".join(reversed(name.split(".")))
def is_glob(pattern):
    """Heuristically decide whether *pattern* is a glob rather than a name.

    "?" or "[" anywhere makes it a candidate; "*" only counts when it is not
    the first character and is either not the last character or follows a
    dot. NOTE(review): a pattern whose only "*" is the leading character
    (e.g. "*.lan") is deliberately not flagged here — presumably handled as
    a plain entry downstream; confirm against the consumer.
    """
    last = len(pattern) - 1
    candidate = False
    for i, c in enumerate(pattern):
        if c in ("?", "["):
            candidate = True
        elif c == "*" and i > 0 and (i < last or pattern[i - 1] == "."):
            candidate = True
    if not candidate:
        return False
    try:
        # Only report a glob if fnmatch accepts the pattern.
        fnmatch.fnmatch("example", pattern)
        return True
    except:
        return False
def covered_by_glob(globs, name):
    """Return True if *name* matches any pattern in *globs*.

    A glob entry never covers itself, so glob lines survive deduplication.
    """
    if name in globs:
        return False
    matched = False
    for pattern in globs:
        try:
            matched = fnmatch.fnmatch(name, pattern)
        except:
            # Malformed patterns are simply skipped.
            matched = False
        if matched:
            break
    return matched
def has_suffix(names, name):
    """Return True when a strict parent domain of *name* is in *names*.

    "www.example.com" is covered by "example.com", but a name never covers
    itself.
    """
    labels = name.split(".")
    for start in range(1, len(labels) + 1):
        if ".".join(labels[start:]) in names:
            return True
    return False
def allowlist_from_url(url):
    """Load the set of allowed names from *url*; empty set when url is falsy."""
    if not url:
        return set()
    content, trusted = load_from_url(url)
    return parse_list(content, trusted)[0]
def blocklists_from_config_file(
    file, allowlist, time_restricted_url, ignore_retrieval_failure, output_file
):
    """Build the unified blocklist described by the configuration file.

    file: config file listing one source URL (or local path) per line;
        lines starting with "#" and blank lines are skipped.
    allowlist: URL/path of names that must never be blocked (may be falsy).
    time_restricted_url: URL/path of names that are blocked on a schedule.
    ignore_retrieval_failure: when True, a source that fails to load is
        reported and skipped instead of aborting the whole run.
    output_file: path to write the result to; stdout when falsy.

    Exits with status 1 when a source cannot be retrieved and failures are
    not ignored.
    """
    blocklists = {}
    allowed_names = set()
    all_names = set()
    unique_names = set()
    all_globs = set()

    # Load the configuration and fetch every enabled source.
    with open(file) as fd:
        for line in fd:
            line = line.strip()
            if line.startswith("#") or line == "":
                continue
            url = line
            try:
                content, trusted = load_from_url(url)
                names, _time_restrictions, globs = parse_list(content, trusted)
                blocklists[url] = names
                all_names |= names
                all_globs |= globs
            except Exception as e:
                sys.stderr.write(str(e))
                if not ignore_retrieval_failure:
                    sys.exit(1)

    # Normalize a bare path to a "file:" URL so load_from_url accepts it.
    if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
        time_restricted_url = "file:" + time_restricted_url

    output_fd = sys.stdout
    if output_file:
        output_fd = open(output_file, "w")
    try:
        # Time-based blocklist
        if time_restricted_url:
            time_restricted_content, _trusted = load_from_url(time_restricted_url)
            time_restricted_names, time_restrictions, _globs = parse_trusted_list(
                time_restricted_content
            )
            if time_restricted_names:
                print(
                    "########## Time-based blocklist ##########\n",
                    file=output_fd,
                    end="\n",
                )
                for name in time_restricted_names:
                    print_restricted_name(output_fd, name, time_restrictions)
            # Time restricted names should be allowed, or they could be always blocked
            allowed_names |= time_restricted_names

        # Allowed list
        if allowlist and not re.match(r"^[a-z0-9]+:", allowlist):
            allowlist = "file:" + allowlist
        allowed_names |= allowlist_from_url(allowlist)

        # Process blocklists: emit each source's names, skipping entries
        # covered by globs, duplicates/parent domains, and allowlisted names.
        for url, names in blocklists.items():
            print(
                "\n\n########## Blocklist from {} ##########\n".format(url),
                file=output_fd,
                end="\n",
            )
            ignored, glob_ignored, allowed = 0, 0, 0
            list_names = list()
            for name in names:
                if covered_by_glob(all_globs, name):
                    glob_ignored += 1
                elif has_suffix(all_names, name) or name in unique_names:
                    ignored += 1
                elif has_suffix(allowed_names, name) or name in allowed_names:
                    allowed += 1
                else:
                    list_names.append(name)
                    unique_names.add(name)
            list_names.sort(key=name_cmp)
            if ignored:
                print(
                    "# Ignored duplicates: {}".format(ignored),
                    file=output_fd,
                    end="\n",
                )
            if glob_ignored:
                print(
                    "# Ignored due to overlapping local patterns: {}".format(
                        glob_ignored
                    ),
                    file=output_fd,
                    end="\n",
                )
            if allowed:
                print(
                    "# Ignored entries due to the allowlist: {}".format(allowed),
                    file=output_fd,
                    end="\n",
                )
            if ignored or glob_ignored or allowed:
                print(file=output_fd, end="\n")
            for name in list_names:
                print(name, file=output_fd, end="\n")
    finally:
        # Fix: the original unconditionally called output_fd.close(), which
        # closed sys.stdout when no --output-file was given, and leaked the
        # file handle on exceptions. Only close the descriptor we opened.
        if output_file:
            output_fd.close()
# ---------------------------------------------------------------------------
# Command-line entry point: parse options and generate the blocklist.
# By default the result is written to stdout, so all diagnostics below must
# go to stderr to keep redirected output clean.
# ---------------------------------------------------------------------------
argp = argparse.ArgumentParser(
    description="Create a unified blocklist from a set of local and remote files"
)
argp.add_argument(
    "-c",
    "--config",
    default="domains-blocklist.conf",
    help="file containing blocklist sources",
)
argp.add_argument(
    "-w",
    "--whitelist",
    # Deprecated alias of -a/--allowlist; hidden so old invocations fail
    # with an explicit migration message instead of silently doing nothing.
    help=argparse.SUPPRESS,
)
argp.add_argument(
    "-a",
    "--allowlist",
    default="domains-allowlist.txt",
    help="file containing a set of names to exclude from the blocklist",
)
argp.add_argument(
    "-r",
    "--time-restricted",
    default="domains-time-restricted.txt",
    help="file containing a set of names to be time restricted",
)
argp.add_argument(
    "-i",
    "--ignore-retrieval-failure",
    action="store_true",
    help="generate list even if some urls couldn't be retrieved",
)
argp.add_argument(
    "-o",
    "--output-file",
    default=None,
    help="save generated blocklist to a text file with the provided file name",
)
argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")
args = argp.parse_args()

whitelist = args.whitelist
if whitelist:
    # Fix: the migration notice and help text used to be printed to stdout,
    # where they would be mixed into a redirected blocklist; use stderr.
    print(
        "The option to provide a set of names to exclude from the blocklist has been changed from -w to -a\r\n",
        file=sys.stderr,
    )
    argp.print_help(sys.stderr)
    sys.exit(1)

conf = args.config
allowlist = args.allowlist
time_restricted = args.time_restricted
ignore_retrieval_failure = args.ignore_retrieval_failure
output_file = args.output_file

blocklists_from_config_file(
    conf, allowlist, time_restricted, ignore_retrieval_failure, output_file
)