Add generic SOCKS proxy support, variable renaming

Now you can run your workers through a SOCKS proxy with the -S flag!
Authentication at the command-line through fully-qualified URLs as well
as interactive authentication are now supported.

I may revert the try/except block in LinodeProxy's constructor should it
be found to be non-advantageous in further testing.  Performance in
creating large quantities of Linode proxies is still dog slow; I may use
concurrent.futures to speed this up in the future (hah) -- this will
require a major rework of the downloader main().
This commit is contained in:
Andrea Rogers 2021-10-19 23:34:30 -04:00
commit cf2a50ba93
5 changed files with 332 additions and 59 deletions

View file

@ -13,7 +13,8 @@
</p>
`squid-dl` is a massively parallel
[yt-dlp](https://github.com/yt-dlp/yt-dlp)-based YouTube downloader.
[yt-dlp](https://github.com/yt-dlp/yt-dlp)-based YouTube downloader useful for
downloading large video playlists over a fast internet connection.
## Installation
Run the `setup.py`, which will install `squid-dl` and its two dependencies:
@ -22,7 +23,7 @@ Run the `setup.py`, which will install `squid-dl` and its two dependencies:
$ python3 setup.py install
```
### Linode Setup
### Linode Proxy Setup
If you want to use the Linode SOCKS proxy feature, be sure to configure the
`linode-cli` first:
```
@ -87,7 +88,43 @@ playlist example, we'll spawn 12 workers:
For more information see the built-in help by running `squid-dl -h`.
## Linode Proxying
### SOCKS Proxying
For those who already have access to a dedicated SOCKS proxy, you can pass
`squid-dl`'s `-S` flag a fully-qualified SOCKS4, SOCKS4A, or SOCKS5 proxy URL
to download your playlists through that proxy! Here's an example using
NordVPN's SOCKS5 proxy:
```
(.venv) $ squid-dl -S socks5://us.socks.nordhold.net:1080 -n 12 Mems.json
[INFO]: Starting squid-dl...
[INFO]: saving videos to "Mems" directory
SOCKS username: 0asFVrZt0bw1ucPvQRKiUe87
SOCKS password:
...
[download] Download completed
[INFO]: Worker 1667326 done...
[INFO]: Worker 1667396 done...
[INFO]: Worker 1667484 done...
[download] Download completed
[download] Download completed
[INFO]: Worker 1667352 done...
[INFO]: Worker 1667421 done...
[INFO]: All done!
```
You can also add a username and password to SOCKS5 and SOCKS4A proxy URLs in a
format like this:
```
socks5://username:password@hostname:port
```
**SECURITY NOTE:** typing in usernames and passwords this way is considered
insecure, as they will likely end up in your shell's history file completely
unprotected and in the clear (☹). It is generally recommended to input the
username and password interactively unless you are scripting `squid-dl`.
### Linode Proxying
With the `-L` option, you can run each worker through its own Linode-powered
SSH-tunneled SOCKSv5 proxy! `squid-dl` will make a temporary SSH key in
the current working directory and then get to work spinning up Linodes and

View file

@ -42,6 +42,8 @@ from yt_dlp.utils import encodeFilename, sanitize_path
from yt_dlp.extractor.common import InfoExtractor as IE
from .linode import LinodeProxy
from .proxy import Proxy
from .socks import SocksProxy
from .util import die, eprint, runcmd
@ -49,7 +51,7 @@ def do_download(
entry_q: Queue,
opts: argparse.Namespace,
sub_langs: [str],
proxy: LinodeProxy = None,
proxy: Proxy = None,
):
sub_opts = {
@ -330,7 +332,7 @@ def resume_preprocess(entries: [dict]) -> list:
return unfinished_entries
def validate_proxy(proxy: LinodeProxy) -> LinodeProxy:
def validate_linode_proxy(proxy: LinodeProxy) -> LinodeProxy:
if not proxy.start():
eprint(
"[WARN]: "
@ -339,7 +341,7 @@ def validate_proxy(proxy: LinodeProxy) -> LinodeProxy:
port = proxy.proxy_port
proxy.cleanup()
proxy = LinodeProxy(proxy_port=port)
return validate_proxy(proxy)
return validate_linode_proxy(proxy)
else:
print(
"[INFO]: SOCKS validation succeeded on port {} from ID {}".format(
@ -349,7 +351,7 @@ def validate_proxy(proxy: LinodeProxy) -> LinodeProxy:
return proxy
def cleanup(workers: [Process], proxies: [LinodeProxy]) -> None:
def cleanup(workers: [Process], linode_proxies: [LinodeProxy]) -> None:
if len(workers) > 0:
for worker in workers:
if worker.is_alive():
@ -360,16 +362,16 @@ def cleanup(workers: [Process], proxies: [LinodeProxy]) -> None:
)
worker.terminate()
if len(proxies) > 0:
if len(linode_proxies) > 0:
print("[CLEANUP]: Deleting Linode proxies...")
for proxy in proxies:
for proxy in linode_proxies:
proxy.cleanup()
def parse_args(args: list, name: str):
parser = argparse.ArgumentParser(prog=name)
group = parser.add_argument_group("Proxy settings")
group = parser.add_mutually_exclusive_group(required=False)
group.add_argument(
"-L",
"--linode-proxy",
@ -380,12 +382,26 @@ def parse_args(args: list, name: str):
+ "for more information.",
)
group.add_argument(
"-S",
"--socks-proxy",
type=str,
default=None,
help="Run workers through a SOCKS proxy. Requires a fully-qualified "
+ 'proxy URL (e.g. "socks5://user:pass@hostname:port" or '
+ '"socks5://hostname:port").\n'
+ "Be mindful of your shell's history file when entering passwords on "
+ "the command line. If this script encounters a proxy that requires "
+ "authentication, it will prompt the user for a password "
+ "interactively, as well.",
)
parser.add_argument(
"-p",
"--proxy-base-port",
type=int,
default=1337,
help="Port number proxy ports are derived from, does nothing without "
"enabling a type of proxy (like --linode-proxy).",
help="Port number that local Linode-powered proxy ports are derived "
+ "from, does nothing without "
+ "enabling --linode-proxy (aka. -L).",
)
parser.add_argument(
"--resume-dump",
@ -446,9 +462,7 @@ def main(args: [str], name: str) -> int:
print('[INFO]: saving videos to "{}" directory'.format(dirname))
if not (os.path.exists(dirname) and os.path.isdir(dirname)):
os.mkdir(dirname)
os.chdir(dirname)
else:
os.chdir(dirname)
playlist_size = len(info_dict["entries"])
info_dict["entries"] = resume_preprocess(info_dict["entries"])
@ -462,7 +476,7 @@ def main(args: [str], name: str) -> int:
)
)
if opts.resume_dump:
rdump = open("resume.json", mode="w")
rdump = open(info_dict["title"] + ".resume.json", mode="w")
rdump.write(j.dumps(info_dict, sort_keys=True, indent=2))
rdump.close()
@ -476,16 +490,20 @@ def main(args: [str], name: str) -> int:
base_port = 1337
workers = []
proxies = []
linode_proxies = []
if opts.socks_proxy is not None:
socks_proxy = SocksProxy(url=opts.socks_proxy)
try:
for n in range(n_workers):
port = base_port + n
if opts.linode_proxy:
proxies.append(
linode_proxies.append(
LinodeProxy(proxy_port=port, pubkey_path=pubkey_path)
)
worker_args = (entry_q, opts, sub_langs, proxies[n])
worker_args = (entry_q, opts, sub_langs, linode_proxies[n])
elif opts.socks_proxy is not None:
worker_args = (entry_q, opts, sub_langs, socks_proxy)
else:
worker_args = (entry_q, opts, sub_langs)
@ -496,7 +514,7 @@ def main(args: [str], name: str) -> int:
)
)
if len(proxies) > 0:
if len(linode_proxies) > 0:
if not (
os.path.isfile(pubkey_path)
or os.path.isfile(os.path.splitext(pubkey_path)[0])
@ -512,7 +530,7 @@ def main(args: [str], name: str) -> int:
print(".", end="")
temp_list = []
for proxy_idx in nodes_to_ping:
if proxies[proxy_idx].get_status() != "running":
if linode_proxies[proxy_idx].get_status() != "running":
temp_list.append(proxy_idx)
sleep(0.2)
nodes_to_ping = temp_list
@ -521,9 +539,11 @@ def main(args: [str], name: str) -> int:
while not entry_q.full():
sleep(0.2)
os.chdir(dirname)
for i in range(n_workers):
if len(proxies) > 0:
proxies[i] = validate_proxy(proxies[i])
if len(linode_proxies) > 0:
linode_proxies[i] = validate_linode_proxy(linode_proxies[i])
seconds = randint(0, 1)
else:
seconds = randint(1, 6)
@ -535,7 +555,7 @@ def main(args: [str], name: str) -> int:
except KeyboardInterrupt:
eprint("\n[CLEANUP]: Interrupted, cleaning up...")
cleanup(workers, proxies)
cleanup(workers, linode_proxies)
if entry_getter.is_alive():
print(
"[CLEANUP]: Terminating queue worker {}".format(
@ -546,6 +566,6 @@ def main(args: [str], name: str) -> int:
return 1
print("[INFO]: All done!")
cleanup(workers, proxies)
cleanup(workers, linode_proxies)
return 0

View file

@ -25,10 +25,11 @@ import subprocess
from time import sleep
import typing
from .proxy import Proxy
from .util import eprint, runcmd
class LinodeProxy:
class LinodeProxy(Proxy):
user_made = False
def __init__(
@ -39,42 +40,45 @@ class LinodeProxy:
debug: bool = False,
exclusive: bool = True,
):
self.proxy_port = proxy_port
self.proxy_user = proxy_user
self.pubkey_path = pubkey_path
self.debug = debug
self.exclusive = exclusive
try:
self.proxy_port = proxy_port
self.proxy_user = proxy_user
self.pubkey_path = pubkey_path
self.debug = debug
self.exclusive = exclusive
self.proxy_url = "socks5://127.0.0.1:" + str(self.proxy_port)
self.proxy_url = "socks5://127.0.0.1:" + str(self.proxy_port)
self.ssh_prefix = (
'ssh -o "UserKnownHostsFile=/dev/null" '
+ '-o "StrictHostKeyChecking=no" -i '
+ splitext(self.pubkey_path)[0]
+ " "
)
pubfile = open(self.pubkey_path, mode="r")
self.pubkey = pubfile.readline().rstrip()
pubfile.close()
self.ssh_prefix = (
'ssh -o "UserKnownHostsFile=/dev/null" '
+ '-o "StrictHostKeyChecking=no" -i '
+ splitext(self.pubkey_path)[0]
+ " "
)
pubfile = open(self.pubkey_path, mode="r")
self.pubkey = pubfile.readline().rstrip()
pubfile.close()
self.passwd = runcmd(
"echo $(cat /dev/random | strings | head -c 512 | "
+ "grep -oE '[a-zA-Z0-9#%!]') | sed 's/\s//g' | head -c 32;"
).decode()
self.passwd = runcmd(
"echo $(cat /dev/random | strings | head -c 512 | "
+ "grep -oE '[a-zA-Z0-9#%!]') | sed 's/\s//g' | head -c 32;"
).decode()
create_cmd = (
"linode-cli --json linodes create "
+ "--image linode/arch "
+ "--authorized_keys "
+ '"'
+ self.pubkey
+ '"'
+ ' --root_pass "'
+ self.passwd
+ '"'
)
self.info = j.loads(runcmd(create_cmd).decode())[0]
print("[INFO]: Created Linode {}.".format(self.info["id"]))
create_cmd = (
"linode-cli --json linodes create "
+ "--image linode/arch "
+ "--authorized_keys "
+ '"'
+ self.pubkey
+ '"'
+ ' --root_pass "'
+ self.passwd
+ '"'
)
self.info = j.loads(runcmd(create_cmd).decode())[0]
print("[INFO]: Created Linode {}.".format(self.info["id"]))
except KeyboardInterrupt:
self.cleanup()
def find_linode(self) -> bool:
linodes = j.loads(runcmd("linode-cli --json linodes list").decode())
@ -161,6 +165,8 @@ class LinodeProxy:
def test_proxy(self) -> bool:
sen = struct.pack("BBB", 0x05, 0x01, 0x00)
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(2)
try:
s.connect(("127.0.0.1", self.proxy_port))
except ConnectionRefusedError as e:
@ -170,8 +176,18 @@ class LinodeProxy:
)
)
return False
s.sendall(sen)
data = s.recv(2)
for n in range(3):
s.sendall(sen)
try:
data = s.recv(2)
break
except socket.timeout:
if n == 2:
eprint(
"[ERROR]: Linode SOCKS timed out after three attempts!",
)
return False
version, auth = struct.unpack("BB", data)
if version == 5 and auth == 0:

38
squid_dl/proxy.py Normal file
View file

@ -0,0 +1,38 @@
"""
squid_dl Proxy base class (for typing)
"""
class Proxy:
    """
    Common base for the proxy classes, used mainly as a shared type for
    annotations.
    """

    # NOTE(review): presumably marks a proxy that must not be shared between
    # workers -- confirm against the downloader.
    exclusive = False

    # Proxy URL string; stays None until a subclass assigns one.
    proxy_url = None

    def cleanup(self) -> None:
        """Do nothing; subclasses holding resources provide their own."""
        return None

162
squid_dl/socks.py Normal file
View file

@ -0,0 +1,162 @@
"""
Generic SOCKS proxy support.
"""
from getpass import getpass
import json as j
import typing
import os
import socket
import struct
from urllib.parse import unquote_plus, urlparse
from yt_dlp.socks import ProxyType, Socks4Error, Socks5Error, sockssocket
from .proxy import Proxy
from .util import die, eprint
class SocksUnknownSchemeError(Exception):
    """Raised when a proxy URL scheme is not a recognised SOCKS variant."""
def test_proxy(host: str, port: int) -> tuple:
    """
    Send a SOCKS5 method-selection greeting to ``host:port`` and return the
    two-byte reply as ``(version, auth)``.

    The greeting offers exactly one method, 0x00 ("no authentication"), so a
    non-zero ``auth`` in the reply means the proxy did not accept
    unauthenticated access.

    :param host: proxy hostname or IPv4 address
    :param port: proxy TCP port
    :returns: ``(version, auth)`` unpacked from the proxy's reply
    :raises SystemExit: if the proxy never answers within three attempts or
        answers with something other than two bytes
    """
    greeting = struct.pack("BBB", 0x05, 0x01, 0x00)
    reply = b""

    # The context manager guarantees the socket is closed on every exit path,
    # including the ones that terminate the process.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(2)
        try:
            s.connect((host, port))
        except OSError as e:
            # OSError covers refused connections as well as DNS failures and
            # connect timeouts, which would otherwise surface as tracebacks.
            # NOTE(review): die() is presumably expected to exit -- confirm.
            die(
                '[ERROR]: Got "{}" when connecting to {} on port {}'.format(
                    e, host, port
                )
            )

        for attempt in range(3):
            s.sendall(greeting)
            try:
                reply = s.recv(2)
                break
            except socket.timeout:
                if attempt == 2:
                    eprint(
                        "[ERROR]: SOCKS proxy timed out after three attempts!"
                    )
                    # SystemExit directly: the exit() builtin is injected by
                    # the site module and is not guaranteed to exist.
                    raise SystemExit(1)

    # recv() may return fewer than two bytes (or b"" if the peer hung up);
    # fail with a readable message instead of a struct.error traceback.
    if len(reply) != 2:
        eprint(
            "[ERROR]: SOCKS proxy sent an incomplete reply "
            "({} of 2 bytes)!".format(len(reply))
        )
        raise SystemExit(1)

    return struct.unpack("BB", reply)  # (version, auth)
class SocksProxy(Proxy):
    """
    A generic SOCKS4/SOCKS4A/SOCKS5 proxy described by a URL or by a dict.

    Pass either ``url`` (e.g. ``socks5://user:pass@hostname:port``) or a
    ``params`` dict containing:

    * ``host`` (string)
    * ``port`` (int or numeric string, optional; defaults to 1080)
    * ``scheme`` (string, optional; defaults to ``socks5``)
    * ``user`` (string, optional)
    * ``password`` (string, optional)

    Construction probes the proxy and may prompt for credentials
    interactively; see :meth:`setup`.

    Credentials are kept in ``self.user`` / ``self.password`` in the form
    they take inside ``self.proxy_url`` and are URL-decoded right before
    being handed to the SOCKS socket.
    """

    def __init__(
        self, url: str = None, params: dict = None, debug: bool = False
    ):
        """
        :param url: fully-qualified proxy URL; takes precedence over
            ``params`` when given
        :param params: dict description of the proxy (see class docstring)
        :param debug: stored on the instance as ``self.debug``
        :raises TypeError: if neither ``url`` nor ``params`` is supplied
        """
        self.debug = debug
        if url is not None:
            self.init_from_url(url)
        elif params is not None:
            self.init_from_params(params)
        else:
            # Previously this fell through to subscripting None.
            raise TypeError(
                "SocksProxy requires either a url or a params dict"
            )

        self.setup()

    def get_socks_proxytype(self):
        """
        Map ``self.scheme`` to the matching yt-dlp ``ProxyType`` member.

        :raises SocksUnknownSchemeError: if the scheme is not one of
            ``socks``, ``socks4``, ``socks4a`` or ``socks5``
        """
        scheme = self.scheme.lower()
        if scheme == "socks5":
            return ProxyType.SOCKS5
        elif scheme in ("socks", "socks4"):
            return ProxyType.SOCKS4
        elif scheme == "socks4a":
            return ProxyType.SOCKS4A
        else:
            eprint("[ERROR]: Unknown scheme in proxy URL!")
            raise SocksUnknownSchemeError

    def init_from_url(self, url: str):
        """Populate scheme, host, port and credentials by parsing ``url``."""
        self.proxy_url = url
        url_pieces = urlparse(url)
        self.scheme = url_pieces.scheme
        self.host = url_pieces.hostname
        self.port = url_pieces.port or 1080
        # urlparse leaves these percent-encoded; setup() decodes them.
        self.user = url_pieces.username
        self.password = url_pieces.password

    def init_from_params(self, params: dict):
        """Populate the instance from ``params`` and build ``proxy_url``."""
        self.host = params["host"]
        # The port may arrive as a string; socket.connect() needs an int.
        self.port = int(params.get("port", 1080))

        if "user" in params and "password" in params:
            self.user = params["user"]
            self.password = params["password"]
            # NOTE(review): inserted into the URL verbatim, so these are
            # presumably expected to be URL-encoded already -- confirm.
            authstr = self.user + ":" + self.password + "@"
        else:
            self.user = None
            self.password = None
            authstr = ""

        self.scheme = (
            params["scheme"].lower() if "scheme" in params else "socks5"
        )
        self.proxy_url = (
            self.scheme + "://" + authstr + self.host + ":" + str(self.port)
        )

    def get_creds(self):
        """
        Prompt for a username and password and rebuild ``proxy_url`` with
        them embedded.
        """
        from urllib.parse import quote

        raw_user = input("SOCKS username: ")
        raw_password = getpass(prompt="SOCKS password: ")

        # Percent-encode so characters such as "@", ":", "+" or "%" neither
        # break URL parsing nor get altered by unquote_plus() in setup().
        self.user = quote(raw_user, safe="")
        self.password = quote(raw_password, safe="")
        # Alias kept for code that read the attribute under its old name.
        self.username = self.user

        self.proxy_url = "{}://{}:{}@{}:{}".format(
            self.scheme, self.user, self.password, self.host, self.port
        )

    def setup(self):
        """
        Probe the proxy, prompt for credentials if the probe reports that
        unauthenticated access was not accepted and none are known yet, then
        attempt a connection through yt-dlp's ``sockssocket``.

        A ``Socks4Error`` / ``Socks5Error`` from that attempt is reported
        through ``die()``.
        """
        _version, auth = test_proxy(host=self.host, port=self.port)
        if auth != 0 and (self.user is None or self.password is None):
            self.get_creds()

        def unquote_if_non_empty(s):
            if not s:
                return s
            return unquote_plus(s)

        proxy_args = (
            self.get_socks_proxytype(),
            self.host,
            self.port or 1080,
            True,  # Remote DNS
            # self.user is what both init paths and get_creds() populate;
            # reading self.username here raised AttributeError whenever the
            # interactive prompt had not run.
            unquote_if_non_empty(self.user),
            unquote_if_non_empty(self.password),
        )

        testsock = sockssocket()
        testsock.setproxy(*proxy_args)
        try:
            # NOTE(review): the destination is the proxy's own address --
            # presumably just a reachable target to exercise the handshake;
            # confirm.
            testsock.connect((self.host, self.port))
        except (Socks4Error, Socks5Error) as e:
            die("[ERROR]: {}: {}".format(type(e).__name__, e))
        finally:
            # Close on failure too, not only after a successful connect.
            testsock.close()