Add generic SOCKS proxy support, variable renaming

Now you can run your workers through a SOCKS proxy with the -S flag!
Authentication at the command-line through fully-qualified URLs as well
as interactive authentication are now supported.

I may revert the try/except block in LinodeProxy's constructor should it
be found to be non-advantageous in further testing.  Performance in
creating large quantities of Linode proxies is still dog slow; I may use
concurrent.futures to speed this up in the future (hah) -- this will
require a major rework of the downloader main().
This commit is contained in:
Andrea Rogers 2021-10-19 23:34:30 -04:00
commit cf2a50ba93
5 changed files with 332 additions and 59 deletions

View file

@ -13,7 +13,8 @@
</p> </p>
`squid-dl` is a massively parallel `squid-dl` is a massively parallel
[yt-dlp](https://github.com/yt-dlp/yt-dlp)-based YouTube downloader. [yt-dlp](https://github.com/yt-dlp/yt-dlp)-based YouTube downloader useful for
downloading large video playlists over a fast internet connection.
## Installation ## Installation
Run the `setup.py`, which will install `squid-dl` and its two dependencies: Run the `setup.py`, which will install `squid-dl` and its two dependencies:
@ -22,7 +23,7 @@ Run the `setup.py`, which will install `squid-dl` and its two dependencies:
$ python3 setup.py install $ python3 setup.py install
``` ```
### Linode Setup ### Linode Proxy Setup
If you want to use the Linode SOCKS proxy feature, be sure to configure the If you want to use the Linode SOCKS proxy feature, be sure to configure the
`linode-cli` first: `linode-cli` first:
``` ```
@ -87,7 +88,43 @@ playlist example, we'll spawn 12 workers:
For more information see the built-in help by running `squid-dl -h`. For more information see the built-in help by running `squid-dl -h`.
## Linode Proxying ### SOCKS Proxying
For those with access to a dedicated SOCKS proxy already, you can use
`squid-dl`'s `-S` option with a fully-qualified SOCKS4, SOCKS4A, or SOCKS5
proxy URL to download your playlists through a proxy! Here's an example using
NordVPN's SOCKS5 proxy:
```
(.venv) $ squid-dl -S socks5://us.socks.nordhold.net:1080 -n 12 Mems.json
[INFO]: Starting squid-dl...
[INFO]: saving videos to "Mems" directory
SOCKS username: 0asFVrZt0bw1ucPvQRKiUe87
SOCKS password:
...
[download] Download completed
[INFO]: Worker 1667326 done...
[INFO]: Worker 1667396 done...
[INFO]: Worker 1667484 done...
[download] Download completed
[download] Download completed
[INFO]: Worker 1667352 done...
[INFO]: Worker 1667421 done...
[INFO]: All done!
```
You can also add a username and password to SOCKS5 and SOCKS4A proxy URLs in a
format like this:
```
socks5://username:password@hostname:port
```
**SECURITY NOTE:** typing in usernames and passwords this way is considered
insecure, as they will likely end up in your shell's history file completely
unprotected and in the clear (☹). It is generally recommended to input the
username and password interactively unless you are scripting `squid-dl`.
### Linode Proxying
With the `-L` option, you can run each worker through its own Linode-powered With the `-L` option, you can run each worker through its own Linode-powered
SSH-tunneled SOCKSv5 proxy! `squid-dl` will make a temporary SSH key in SSH-tunneled SOCKSv5 proxy! `squid-dl` will make a temporary SSH key in
the current working directory and then get to work spinning up Linodes and the current working directory and then get to work spinning up Linodes and

View file

@ -42,6 +42,8 @@ from yt_dlp.utils import encodeFilename, sanitize_path
from yt_dlp.extractor.common import InfoExtractor as IE from yt_dlp.extractor.common import InfoExtractor as IE
from .linode import LinodeProxy from .linode import LinodeProxy
from .proxy import Proxy
from .socks import SocksProxy
from .util import die, eprint, runcmd from .util import die, eprint, runcmd
@ -49,7 +51,7 @@ def do_download(
entry_q: Queue, entry_q: Queue,
opts: argparse.Namespace, opts: argparse.Namespace,
sub_langs: [str], sub_langs: [str],
proxy: LinodeProxy = None, proxy: Proxy = None,
): ):
sub_opts = { sub_opts = {
@ -330,7 +332,7 @@ def resume_preprocess(entries: [dict]) -> list:
return unfinished_entries return unfinished_entries
def validate_proxy(proxy: LinodeProxy) -> LinodeProxy: def validate_linode_proxy(proxy: LinodeProxy) -> LinodeProxy:
if not proxy.start(): if not proxy.start():
eprint( eprint(
"[WARN]: " "[WARN]: "
@ -339,7 +341,7 @@ def validate_proxy(proxy: LinodeProxy) -> LinodeProxy:
port = proxy.proxy_port port = proxy.proxy_port
proxy.cleanup() proxy.cleanup()
proxy = LinodeProxy(proxy_port=port) proxy = LinodeProxy(proxy_port=port)
return validate_proxy(proxy) return validate_linode_proxy(proxy)
else: else:
print( print(
"[INFO]: SOCKS validation succeeded on port {} from ID {}".format( "[INFO]: SOCKS validation succeeded on port {} from ID {}".format(
@ -349,7 +351,7 @@ def validate_proxy(proxy: LinodeProxy) -> LinodeProxy:
return proxy return proxy
def cleanup(workers: [Process], proxies: [LinodeProxy]) -> None: def cleanup(workers: [Process], linode_proxies: [LinodeProxy]) -> None:
if len(workers) > 0: if len(workers) > 0:
for worker in workers: for worker in workers:
if worker.is_alive(): if worker.is_alive():
@ -360,16 +362,16 @@ def cleanup(workers: [Process], proxies: [LinodeProxy]) -> None:
) )
worker.terminate() worker.terminate()
if len(proxies) > 0: if len(linode_proxies) > 0:
print("[CLEANUP]: Deleting Linode proxies...") print("[CLEANUP]: Deleting Linode proxies...")
for proxy in proxies: for proxy in linode_proxies:
proxy.cleanup() proxy.cleanup()
def parse_args(args: list, name: str): def parse_args(args: list, name: str):
parser = argparse.ArgumentParser(prog=name) parser = argparse.ArgumentParser(prog=name)
group = parser.add_argument_group("Proxy settings") group = parser.add_mutually_exclusive_group(required=False)
group.add_argument( group.add_argument(
"-L", "-L",
"--linode-proxy", "--linode-proxy",
@ -380,12 +382,26 @@ def parse_args(args: list, name: str):
+ "for more information.", + "for more information.",
) )
group.add_argument( group.add_argument(
"-S",
"--socks-proxy",
type=str,
default=None,
help="Run workers through a SOCKS proxy. Requires a fully-qualified "
+ 'proxy URL (e.g. "socks5://user:pass@hostname:port" or '
+ '"socks5://hostname:port").\n'
+ "Be mindful of your shell's history file when entering passwords on "
+ "the command line. If this script encounters a proxy that requires "
+ "authentication, it will prompt the user for a password "
+ "interactively, as well.",
)
parser.add_argument(
"-p", "-p",
"--proxy-base-port", "--proxy-base-port",
type=int, type=int,
default=1337, default=1337,
help="Port number proxy ports are derived from, does nothing without " help="Port number that local Linode-powered proxy ports are derived "
"enabling a type of proxy (like --linode-proxy).", + "from, does nothing without "
+ "enabling --linode-proxy (aka. -L).",
) )
parser.add_argument( parser.add_argument(
"--resume-dump", "--resume-dump",
@ -446,9 +462,7 @@ def main(args: [str], name: str) -> int:
print('[INFO]: saving videos to "{}" directory'.format(dirname)) print('[INFO]: saving videos to "{}" directory'.format(dirname))
if not (os.path.exists(dirname) and os.path.isdir(dirname)): if not (os.path.exists(dirname) and os.path.isdir(dirname)):
os.mkdir(dirname) os.mkdir(dirname)
os.chdir(dirname)
else: else:
os.chdir(dirname)
playlist_size = len(info_dict["entries"]) playlist_size = len(info_dict["entries"])
info_dict["entries"] = resume_preprocess(info_dict["entries"]) info_dict["entries"] = resume_preprocess(info_dict["entries"])
@ -462,7 +476,7 @@ def main(args: [str], name: str) -> int:
) )
) )
if opts.resume_dump: if opts.resume_dump:
rdump = open("resume.json", mode="w") rdump = open(info_dict["title"] + ".resume.json", mode="w")
rdump.write(j.dumps(info_dict, sort_keys=True, indent=2)) rdump.write(j.dumps(info_dict, sort_keys=True, indent=2))
rdump.close() rdump.close()
@ -476,16 +490,20 @@ def main(args: [str], name: str) -> int:
base_port = 1337 base_port = 1337
workers = [] workers = []
proxies = [] linode_proxies = []
if opts.socks_proxy is not None:
socks_proxy = SocksProxy(url=opts.socks_proxy)
try: try:
for n in range(n_workers): for n in range(n_workers):
port = base_port + n port = base_port + n
if opts.linode_proxy: if opts.linode_proxy:
proxies.append( linode_proxies.append(
LinodeProxy(proxy_port=port, pubkey_path=pubkey_path) LinodeProxy(proxy_port=port, pubkey_path=pubkey_path)
) )
worker_args = (entry_q, opts, sub_langs, proxies[n]) worker_args = (entry_q, opts, sub_langs, linode_proxies[n])
elif opts.socks_proxy is not None:
worker_args = (entry_q, opts, sub_langs, socks_proxy)
else: else:
worker_args = (entry_q, opts, sub_langs) worker_args = (entry_q, opts, sub_langs)
@ -496,7 +514,7 @@ def main(args: [str], name: str) -> int:
) )
) )
if len(proxies) > 0: if len(linode_proxies) > 0:
if not ( if not (
os.path.isfile(pubkey_path) os.path.isfile(pubkey_path)
or os.path.isfile(os.path.splitext(pubkey_path)[0]) or os.path.isfile(os.path.splitext(pubkey_path)[0])
@ -512,7 +530,7 @@ def main(args: [str], name: str) -> int:
print(".", end="") print(".", end="")
temp_list = [] temp_list = []
for proxy_idx in nodes_to_ping: for proxy_idx in nodes_to_ping:
if proxies[proxy_idx].get_status() != "running": if linode_proxies[proxy_idx].get_status() != "running":
temp_list.append(proxy_idx) temp_list.append(proxy_idx)
sleep(0.2) sleep(0.2)
nodes_to_ping = temp_list nodes_to_ping = temp_list
@ -521,9 +539,11 @@ def main(args: [str], name: str) -> int:
while not entry_q.full(): while not entry_q.full():
sleep(0.2) sleep(0.2)
os.chdir(dirname)
for i in range(n_workers): for i in range(n_workers):
if len(proxies) > 0: if len(linode_proxies) > 0:
proxies[i] = validate_proxy(proxies[i]) linode_proxies[i] = validate_linode_proxy(linode_proxies[i])
seconds = randint(0, 1) seconds = randint(0, 1)
else: else:
seconds = randint(1, 6) seconds = randint(1, 6)
@ -535,7 +555,7 @@ def main(args: [str], name: str) -> int:
except KeyboardInterrupt: except KeyboardInterrupt:
eprint("\n[CLEANUP]: Interrupted, cleaning up...") eprint("\n[CLEANUP]: Interrupted, cleaning up...")
cleanup(workers, proxies) cleanup(workers, linode_proxies)
if entry_getter.is_alive(): if entry_getter.is_alive():
print( print(
"[CLEANUP]: Terminating queue worker {}".format( "[CLEANUP]: Terminating queue worker {}".format(
@ -546,6 +566,6 @@ def main(args: [str], name: str) -> int:
return 1 return 1
print("[INFO]: All done!") print("[INFO]: All done!")
cleanup(workers, proxies) cleanup(workers, linode_proxies)
return 0 return 0

View file

@ -25,10 +25,11 @@ import subprocess
from time import sleep from time import sleep
import typing import typing
from .proxy import Proxy
from .util import eprint, runcmd from .util import eprint, runcmd
class LinodeProxy: class LinodeProxy(Proxy):
user_made = False user_made = False
def __init__( def __init__(
@ -39,6 +40,7 @@ class LinodeProxy:
debug: bool = False, debug: bool = False,
exclusive: bool = True, exclusive: bool = True,
): ):
try:
self.proxy_port = proxy_port self.proxy_port = proxy_port
self.proxy_user = proxy_user self.proxy_user = proxy_user
self.pubkey_path = pubkey_path self.pubkey_path = pubkey_path
@ -75,6 +77,8 @@ class LinodeProxy:
) )
self.info = j.loads(runcmd(create_cmd).decode())[0] self.info = j.loads(runcmd(create_cmd).decode())[0]
print("[INFO]: Created Linode {}.".format(self.info["id"])) print("[INFO]: Created Linode {}.".format(self.info["id"]))
except KeyboardInterrupt:
self.cleanup()
def find_linode(self) -> bool: def find_linode(self) -> bool:
linodes = j.loads(runcmd("linode-cli --json linodes list").decode()) linodes = j.loads(runcmd("linode-cli --json linodes list").decode())
@ -161,6 +165,8 @@ class LinodeProxy:
def test_proxy(self) -> bool: def test_proxy(self) -> bool:
sen = struct.pack("BBB", 0x05, 0x01, 0x00) sen = struct.pack("BBB", 0x05, 0x01, 0x00)
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(2)
try: try:
s.connect(("127.0.0.1", self.proxy_port)) s.connect(("127.0.0.1", self.proxy_port))
except ConnectionRefusedError as e: except ConnectionRefusedError as e:
@ -170,8 +176,18 @@ class LinodeProxy:
) )
) )
return False return False
for n in range(3):
s.sendall(sen) s.sendall(sen)
try:
data = s.recv(2) data = s.recv(2)
break
except socket.timeout:
if n == 2:
eprint(
"[ERROR]: Linode SOCKS timed out after three attempts!",
)
return False
version, auth = struct.unpack("BB", data) version, auth = struct.unpack("BB", data)
if version == 5 and auth == 0: if version == 5 and auth == 0:

38
squid_dl/proxy.py Normal file
View file

@ -0,0 +1,38 @@
"""
squid_dl Proxy base class (for typing)
"""
class Proxy:
    """
    Minimal common base for the proxy flavours handed to download workers.

    It exists mainly so that functions can annotate a parameter as "some kind
    of proxy"; subclasses overwrite the defaults below as they see fit.
    """

    # URL of the proxy; ``None`` until a subclass assigns one.
    proxy_url = None
    # Defaults to ``False``; NOTE(review): presumably marks a proxy reserved
    # for a single worker -- confirm against the subclasses.
    exclusive = False

    def cleanup(self) -> None:
        """Release nothing: the base class holds no resources."""
        return None

162
squid_dl/socks.py Normal file
View file

@ -0,0 +1,162 @@
"""
Generic SOCKS proxy support.
"""
from getpass import getpass
import json as j
import typing
import os
import socket
import struct
from urllib.parse import unquote_plus, urlparse
from yt_dlp.socks import ProxyType, Socks4Error, Socks5Error, sockssocket
from .proxy import Proxy
from .util import die, eprint
class SocksUnknownSchemeError(Exception):
    """Raised when a proxy URL's scheme is not a known SOCKS variant."""
def test_proxy(host: str, port: int):
    """
    Probe a SOCKS server with an unauthenticated SOCKS5 greeting.

    The three-byte greeting ``05 01 00`` (version 5, one method offered,
    method 0 = "no authentication") is sent and the server's two-byte answer
    is read back, retrying up to three times on a receive timeout.

    :param host: host name or address of the SOCKS server
    :param port: TCP port of the SOCKS server
    :returns: ``(version, auth)`` unpacked from the two reply bytes; per the
        SOCKS5 method-selection reply, ``auth`` is the method the server
        picked (``0`` meaning no authentication is required)
    :raises SystemExit: when no reply arrives within three attempts or the
        reply is shorter than two bytes; connection failures are reported
        through ``die``
    """
    greeting = struct.pack("BBB", 0x05, 0x01, 0x00)
    data = b""

    # The context manager closes the socket on every path, including the
    # error exits below (the original leaked it there).
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(2)
        try:
            s.connect((host, port))
        except OSError as e:
            # OSError also covers connect timeouts and name-resolution
            # failures, not just ConnectionRefusedError.
            die(
                '[ERROR]: Got "{}" when connecting to {} on port {}'.format(
                    e, host, port
                )
            )

        for attempt in range(3):
            # NOTE(review): the greeting is re-sent on every retry, as in the
            # original; confirm servers tolerate a repeated greeting.
            s.sendall(greeting)
            try:
                data = s.recv(2)
                break
            except socket.timeout:
                if attempt == 2:
                    eprint(
                        "[ERROR]: SOCKS proxy timed out after three attempts!"
                    )
                    raise SystemExit(1)

    # A peer that closes early (or sends a single byte) would otherwise make
    # struct.unpack raise an unhelpful struct.error.
    if len(data) != 2:
        eprint("[ERROR]: SOCKS proxy sent an incomplete reply!")
        raise SystemExit(1)
    return struct.unpack("BB", data)  # (version, auth)
class SocksProxy(Proxy):
    """
    A pre-existing SOCKS4/SOCKS4A/SOCKS5 proxy described by a URL.

    Build it either from a fully-qualified ``url`` (for example
    ``socks5://user:pass@hostname:port``) or from a ``params`` dict holding:

    * ``host`` (string)
    * ``port`` (int or numeric string, optional, defaults to 1080)
    * ``scheme`` (string, optional, defaults to ``socks5``)
    * ``user`` (string, optional)
    * ``password`` (string, optional)

    ``user`` and ``password`` are stored exactly as they appear inside
    ``proxy_url`` (i.e. percent-encoded) and are decoded only when handed to
    the SOCKS socket.  Constructing an instance performs network I/O and may
    prompt on the terminal, see ``setup``.
    """

    def __init__(
        self, url: str = None, params: dict = None, debug: bool = False
    ):
        """
        :param url: fully-qualified proxy URL; takes precedence over ``params``
        :param params: dict of connection details, used when ``url`` is None
        :param debug: stored on the instance; not read by this class
        :raises TypeError: when neither ``url`` nor ``params`` is given
        """
        self.debug = debug
        if url is not None:
            self.init_from_url(url)
        elif params is not None:
            self.init_from_params(params)
        else:
            # Same exception type the original hit via ``None["host"]``, but
            # with a message that says what is actually wrong.
            raise TypeError("SocksProxy needs either a url or a params dict")
        self.setup()

    @property
    def username(self):
        """Alias of ``user``, kept for code that used the old attribute."""
        return self.user

    @username.setter
    def username(self, value):
        self.user = value

    def get_socks_proxytype(self):
        """
        Map ``self.scheme`` onto the matching ``ProxyType`` member.

        :raises SocksUnknownSchemeError: for any scheme other than ``socks``,
            ``socks4``, ``socks4a`` or ``socks5`` (case-insensitive)
        """
        scheme = (self.scheme or "").lower()
        if scheme == "socks5":
            return ProxyType.SOCKS5
        if scheme in ("socks", "socks4"):
            return ProxyType.SOCKS4
        if scheme == "socks4a":
            return ProxyType.SOCKS4A
        eprint("[ERROR]: Unknown scheme in proxy URL!")
        raise SocksUnknownSchemeError

    def init_from_url(self, url: str):
        """Fill in scheme, host, port and credentials by parsing ``url``."""
        self.proxy_url = url
        url_pieces = urlparse(url)
        self.scheme = url_pieces.scheme
        self.host = url_pieces.hostname
        self.port = url_pieces.port or 1080
        # urlparse leaves these percent-encoded; setup() decodes them.
        self.user = url_pieces.username
        self.password = url_pieces.password

    def init_from_params(self, params: dict):
        """
        Fill in the connection details from ``params`` and build
        ``proxy_url`` out of them.

        Credentials are only used when both ``user`` and ``password`` are
        present.  They are inserted into the URL verbatim, so they should
        already be percent-encoded.
        """
        self.host = params["host"]
        # The port may arrive as a string; sockets need an int.
        self.port = int(params.get("port") or 1080)
        if "user" in params and "password" in params:
            self.user = params["user"]
            self.password = params["password"]
            authstr = self.user + ":" + self.password + "@"
        else:
            self.user = None
            self.password = None
            authstr = ""
        self.scheme = (
            params["scheme"].lower() if "scheme" in params else "socks5"
        )
        self.proxy_url = (
            self.scheme + "://" + authstr + self.host + ":" + str(self.port)
        )

    def get_creds(self):
        """
        Ask for username and password on the terminal and rebuild
        ``proxy_url`` so that it carries them.
        """
        from urllib.parse import quote

        # Percent-encode what was typed: characters such as "@", "/", "+" or
        # "%" would otherwise break the URL or be mangled by the later
        # unquote_plus() in setup().
        self.user = quote(input("SOCKS username: "), safe="")
        self.password = quote(getpass(prompt="SOCKS password: "), safe="")
        self.proxy_url = (
            self.scheme
            + "://"
            + self.user
            + ":"
            + self.password
            + "@"
            + self.host
            + ":"
            + str(self.port)
        )

    def setup(self):
        """
        Validate the proxy: check the scheme, probe the server, prompt for
        credentials if the server does not accept unauthenticated access and
        none were supplied, then attempt a connection through the proxy.

        :raises SocksUnknownSchemeError: for an unsupported scheme
        :raises ValueError: when no host could be determined
        """
        # Resolve the proxy type first so a malformed URL fails before any
        # network traffic or credential prompt.
        proxy_type = self.get_socks_proxytype()
        if not self.host:
            raise ValueError("SOCKS proxy URL does not contain a host name")

        _, auth = test_proxy(host=self.host, port=self.port)
        if auth != 0 and (self.user is None or self.password is None):
            self.get_creds()

        def unquote_if_non_empty(s):
            if not s:
                return s
            return unquote_plus(s)

        proxy_args = (
            proxy_type,
            self.host,
            self.port,
            True,  # Remote DNS
            unquote_if_non_empty(self.user),
            unquote_if_non_empty(self.password),
        )

        testsock = sockssocket()
        testsock.setproxy(*proxy_args)
        try:
            # NOTE(review): the destination is the proxy's own host/port;
            # presumably only the handshake/auth outcome matters -- confirm.
            testsock.connect((self.host, self.port))
        except (Socks4Error, Socks5Error) as e:
            die("[ERROR]: {}: {}".format(type(e).__name__, e))
        finally:
            # Close on success and on failure alike.
            testsock.close()