1
0
Fork 0
Browse Source

Initial commit with all the files.

master
Alexander Andreev 2 years ago
commit
a5028162d8
  1. 5
      .gitignore
  2. 11
      CHANGELOG.md
  3. 21
      COPYING
  4. 13
      Makefile
  5. 33
      README.md
  6. 13
      scrapthechan/__init__.py
  7. 0
      scrapthechan/cli/__init__.py
  8. 116
      scrapthechan/cli/scraper.py
  9. 23
      scrapthechan/fileinfo.py
  10. 81
      scrapthechan/parser.py
  11. 34
      scrapthechan/parsers/__init__.py
  12. 43
      scrapthechan/parsers/dvach.py
  13. 49
      scrapthechan/parsers/fourchan.py
  14. 57
      scrapthechan/parsers/lainchan.py
  15. 96
      scrapthechan/scraper.py
  16. 0
      scrapthechan/scrapers/__init__.py
  17. 15
      scrapthechan/scrapers/basicscraper.py
  18. 42
      setup.cfg
  19. 3
      setup.py

5
.gitignore vendored

@ -0,0 +1,5 @@
.vscode/
build/
dist/
*.egg-info/
__pycache__

11
CHANGELOG.md

@ -0,0 +1,11 @@
# Changelog
## 0.1.0 - 2020-07-08
### Added
- JSON parsers for 4chan.org, lainchan.org and 2ch.hk.
- Basic straightforward scraper that downloads files one by one.
### Issues
- 2ch.hk: I can't figure out what exactly it reports as the size and hash of a
file. Example: a file may have a size of 127798 bytes (125K), but 2ch reports
150, and the reported hash doesn't equal the computed one.

21
COPYING

@ -0,0 +1,21 @@
The MIT License
Copyright (c) 2020 Alexander "Arav" Andreev <me@arav.top>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

13
Makefile

@ -0,0 +1,13 @@
# Version of the wheel produced by `make build`; keep it in sync with
# __version__ in scrapthechan/__init__.py.
VERSION := 0.1.0

build: scrapthechan README.md setup.cfg
	python setup.py sdist bdist_wheel
install:
	python -m pip install --upgrade dist/scrapthechan-$(VERSION)-py3-none-any.whl --user
uninstall:
# We change directory so pip uninstall will run, it'll fail otherwise.
# Each recipe line runs in its own shell, so the `cd` has to share a line
# with the pip call to have any effect.
	cd ~/ && python -m pip uninstall scrapthechan
clean:
	rm -rf __pycache__ scrapthechan/__pycache__ scrapthechan/parsers/__pycache__ \
		scrapthechan/cli/__pycache__ scrapthechan/scrapers/__pycache__ \
		scrapthechan.egg-info build
.PHONY: build install uninstall clean

33
README.md

@ -0,0 +1,33 @@
This is a tool for scraping files from imageboards' threads.
It extracts the files from a JSON version of a thread and then downloads them
into a specified output directory. If no directory is specified, it creates the
following directory hierarchy in the working directory:
<imageboard name>
|-<board name>
|-<thread>
|-[!op.txt]
|-...
|-...
# Usage
```bash
scrapthechan [<url> | <imageboard> <board> <thread>] [-o,--output-dir] [--no-op]
[-v,--version] [-h,--help]
```
There are two ways to pass a thread. One is by passing a full URL of a thread
(`<url>` argument), and the other one is by passing thread in three components:
`<imageboard>` is a name of website (e.g. 4chan), `<board>` is a name of a board
(e.g. wg), and `<thread>` is a number of a thread on that board.
`-o`, `--output-dir` -- output directory where all files will be dumped to.
`--no-op` -- by default OP's post will be saved in a `!op.txt` file. This flag
disables this behaviour. I decided to put an `!` in the name so that this file
will be at the top of a directory listing.
`-v`, `--version` prints the version of the program, and `-h`, `--help` prints
help for a program.

13
scrapthechan/__init__.py

@ -0,0 +1,13 @@
# Package metadata. VERSION is the multi-line banner assembled from the
# values below.
__date__ = "8 July 2020"
__version__ = "0.1.0"
__author__ = "Alexander \"Arav\" Andreev"
__email__ = "me@arav.top"
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"
__license__ = \
"""This program is licensed under the terms of the MIT license.
For a copy see COPYING file in a directory of the program, or
see <https://opensource.org/licenses/MIT>"""
VERSION = \
    f"ScrapTheChan ver. {__version__} ({__date__})\n\n{__copyright__}\n"\
    f"\n{__license__}"

0
scrapthechan/cli/__init__.py

116
scrapthechan/cli/scraper.py

@ -0,0 +1,116 @@
from argparse import ArgumentParser
from os import makedirs
from os.path import join, exists
from re import search
from sys import argv
from typing import List
from scrapthechan import VERSION
from scrapthechan.parser import Parser, ParserThreadNotFoundError
from scrapthechan.parsers import get_parser_by_url, get_parser_by_site, \
SUPPORTED_IMAGEBOARDS
from scrapthechan.scrapers.basicscraper import BasicScraper
__all__ = ["main"]
# Help text printed by main(). The synopsis names both ways of addressing a
# thread: a full URL, or imageboard, board and thread as separate arguments.
USAGE = \
"""Usage: scrapthechan [OPTIONS] (URL|IMAGEBOARD BOARD THREAD)
Options:
\t-h,--help -- print this help and exit;
\t-v,--version -- print program's version and exit;
\t-o,--output-dir -- directory where to place scraped files. By default
\t following structure will be created in current directory:
\t <imageboard>/<board>/<thread>;
\t-N,--no-op -- by default OP's post will be written in !op.txt file. This
\t option disables this behaviour;
Supported imageboards: 4chan.org, 2ch.hk, lainchan.org
"""
def parse_common_arguments(args: str) -> dict:
    """Detect the help and version switches in a command line.

    Arguments:
    args -- command-line arguments joined into one string with spaces.

    Returns a dictionary with boolean `help` and `version` keys, or `None`
    when neither switch is present. Only the first switch found is reported.
    """
    # A switch has to be a whole whitespace-delimited token. A plain substring
    # search would also fire on URLs or paths that merely contain "-h" or
    # "-v" (e.g. a thread slug such as ".../my-house").
    pattern = r"(?:^|\s)(?:(?P<help>-h|--help)|(?P<version>-v|--version))(?=\s|$)"
    found = search(pattern, args)
    if found is None:
        return None
    return {
        "help": found["help"] is not None,
        "version": found["version"] is not None}
def parse_arguments(args: str) -> dict:
    """Extract the thread location and the options from a command line.

    Arguments:
    args -- command-line arguments joined into one string with spaces.

    Returns a dictionary with the keys `site`, `board`, `thread` (each `None`
    when no thread could be recognised), `no-op` (bool) and `output-dir`
    (`None` when not given).
    """
    rlink = r"^(https?:\/\/)?(?P<site>[\w.-]+)[ \/](?P<board>\w+)(\S+)?[ \/](?P<thread>\w+)"
    no_op = False
    output_dir = None
    positional = []
    # Options are taken out token by token, so they may come before or after
    # the thread and are never mistaken for the site/board/thread triple.
    # Whole-token comparison also keeps "-N" inside a path from setting no-op.
    tokens = iter(args.split())
    for token in tokens:
        if token in ("-N", "--no-op"):
            no_op = True
        elif token in ("-o", "--output-dir"):
            value = next(tokens, None)
            # The first occurrence wins when the option is repeated.
            if output_dir is None:
                output_dir = value
        else:
            positional.append(token)
    link = search(rlink, " ".join(positional))
    found = {} if link is None else link.groupdict()
    return {
        "site": found.get("site"),
        "board": found.get("board"),
        "thread": found.get("thread"),
        "no-op": no_op,
        "output-dir": output_dir}
def main() -> None:
    """Entry point of the command-line scraper.

    Reads the arguments from `sys.argv`, prints help or version when asked,
    otherwise builds a parser for the requested thread, optionally writes the
    OP's post into `!op.txt`, and downloads the thread's files with
    `BasicScraper`.

    Exits with status 0 after printing help or version and with status 1 when
    the thread is not specified, the imageboard is not supported, or the
    thread cannot be fetched.
    """
    cmdline = ' '.join(argv[1:])

    cargs = parse_common_arguments(cmdline)
    if cargs is not None:
        if cargs["help"]:
            print(USAGE)
            raise SystemExit(0)
        if cargs["version"]:
            print(VERSION)
            raise SystemExit(0)

    args = parse_arguments(cmdline)
    # parse_arguments always returns every key, so the values, not the keys,
    # tell whether a thread was actually given.
    if args is None \
        or any(args.get(key) is None for key in ("site", "board", "thread")):
        print(USAGE)
        raise SystemExit(1)

    try:
        parser = get_parser_by_site(args["site"], args["board"], args["thread"])
    except NotImplementedError as ex:
        print(f"{str(ex)}.")
        print(f"Supported image boards are {', '.join(SUPPORTED_IMAGEBOARDS)}")
        raise SystemExit(1)
    except ParserThreadNotFoundError:
        print("Thread no longer exists.")
        raise SystemExit(1)

    flen = len(parser.files)
    print(f"There are {flen} files in this thread.")

    if args["output-dir"] is not None:
        save_dir = args["output-dir"]
    else:
        save_dir = join(parser.imageboard, parser.board, parser.thread)
    print(f"They will be saved in {save_dir}.")
    makedirs(save_dir, exist_ok=True)

    if not args["no-op"]:
        print("Writing OP... ", end='')
        op_path = join(save_dir, "!op.txt")
        if not exists(op_path):
            # Posts may contain arbitrary Unicode, so do not depend on the
            # platform's default encoding.
            with open(op_path, 'w', encoding="utf-8") as opf:
                opf.write(f"{parser.op}\n")
            print("Done.")
        else:
            print("Exists.")

    # The callback rewrites one console line with "<current>/<total>".
    scraper = BasicScraper(save_dir, parser.files,
        lambda i: print(f"{i}/{flen}", end="\r"))
    scraper.run()


if __name__ == "__main__":
    main()

23
scrapthechan/fileinfo.py

@ -0,0 +1,23 @@
"""FileInfo object stores all needed information about a file."""
__all__ = ["FileInfo"]
class FileInfo:
    """Plain record holding everything known about one file of a thread.

    Arguments:
    - `name` -- name of a file;
    - `size` -- size of a file;
    - `dlurl` -- full download URL for a file;
    - `hash_value` -- hash sum of a file;
    - `hash_algo` -- hash algorithm used (e.g. md5).

    Every argument is stored unchanged in an attribute of the same name.
    """

    def __init__(self, name: str, size: int, dlurl: str,
            hash_value: str, hash_algo: str) -> None:
        # Identity and location of the file.
        self.name, self.size, self.dlurl = name, size, dlurl
        # Checksum data used to verify a download.
        self.hash_value, self.hash_algo = hash_value, hash_algo

81
scrapthechan/parser.py

@ -0,0 +1,81 @@
"""Base `Parser` class for JSON parsers to inherit."""
from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, urlretrieve
from scrapthechan.fileinfo import FileInfo
__all__ = ["Parser", "ParserThreadNotFoundError"]
class ParserThreadNotFoundError(Exception):
    """Raised when the JSON of a thread cannot be fetched or decoded."""
class Parser:
    """Base class for all parsers.

    It collects all the files of a thread into a list of the `FileInfo`
    objects. Subclasses fetch the thread's JSON with `_get_json` and implement
    `imageboard`, `op` and `_parse_post`.

    Also it keeps OP's post, that may come handy if you do bulk scraping.

    Arguments:
    board -- is a name of a board on an image board;
    thread -- is a name of a thread inside a board;
    posts -- is a list of posts in form of dictionaries exported from a JSON;
    the first one is taken as OP's post;
    skip_posts -- number of posts to skip.

    All the extracted files will be stored as the `FileInfo` objects."""

    __url_thread_json: str = "https://example.org/{board}/{thread}.json"
    __url_file_link: str = None

    def __init__(self, board: str, thread: str, posts: List[dict],
            skip_posts: Optional[int] = None) -> None:
        self._board = board
        self._thread = thread
        # OP's post is remembered before skipping, so `op` stays available
        # even when the first posts are skipped.
        self._op_post = posts[0]
        if skip_posts is not None:
            posts = posts[skip_posts:]
        # `_parse_post` returns None (or an empty list) for posts without
        # files; such results are dropped while flattening.
        self._files = [file for post in posts
            for file in (self._parse_post(post) or ())]

    @property
    def imageboard(self) -> str:
        """Returns image board's name. Must be overridden by a subclass."""
        raise NotImplementedError

    @property
    def board(self) -> str:
        """Returns a name of a board of image board."""
        return self._board

    @property
    def thread(self) -> str:
        """Returns a name of thread from a board."""
        return self._thread

    @property
    def op(self) -> str:
        """Returns OP's post as combination of subject and comment separated
        by a new line. Must be overridden by a subclass."""
        raise NotImplementedError

    @property
    def files(self) -> List[FileInfo]:
        """Returns a list of retrieved files as `FileInfo` objects."""
        return self._files

    def _get_json(self, thread_url: str) -> dict:
        """Gets JSON version of a thread and converts it in a dictionary.

        Raises `ParserThreadNotFoundError` when fetching or decoding fails;
        the original error is kept as the cause."""
        try:
            with urlopen(thread_url) as url:
                return loads(url.read().decode('utf-8'))
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt
        # and SystemExit are not converted into "thread not found".
        except Exception as ex:
            raise ParserThreadNotFoundError from ex

    def _parse_post(self, post: dict) -> List[FileInfo]:
        """Parses a single post and extracts files into `FileInfo` object.
        Must be overridden by a subclass."""
        raise NotImplementedError

34
scrapthechan/parsers/__init__.py

@ -0,0 +1,34 @@
"""Here are defined the JSON parsers for imageboards."""
from re import search
from typing import List
from scrapthechan.parser import Parser
# Public names of the `scrapthechan.parsers` package.
__all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"]
# Imageboards for which a parser exists; `get_parser_by_site` additionally
# accepts a few aliases of these names.
SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk"]
def get_parser_by_url(url: str) -> Parser:
    """Parses URL and extracts from it site name, board and thread.
    And then returns initialised Parser object for detected imageboard.

    Raises `ValueError` when `url` does not look like a thread URL, and
    whatever `get_parser_by_site` raises for the extracted values."""
    URLRX = r"https?:\/\/(?P<s>[\w\.]+)\/(?P<b>\w+)\/(?:\w+)?\/(?P<t>\w+)"
    found = search(URLRX, url)
    # Without this check a non-matching URL would surface as an obscure
    # AttributeError on None.
    if found is None:
        raise ValueError(
            f"Cannot extract site, board and thread from URL: {url}")
    site, board, thread = found.groups()
    return get_parser_by_site(site, board, thread)
def get_parser_by_site(site: str, board: str, thread: str) -> Parser:
    """Build the parser that handles `site` for the given `board` and `thread`.

    Raises `NotImplementedError` when no parser handles `site`."""
    # Each parser module is imported only when its imageboard is requested.
    if site in ('boards.4chan.org', 'boards.4channel.org',
            '4chan', '4chan.org'):
        from .fourchan import FourChanParser
        return FourChanParser(board, thread)

    if site in ('lainchan.org', 'lainchan'):
        from .lainchan import LainchanParser
        return LainchanParser(board, thread)

    if site in ('2ch.hk', '2ch'):
        from .dvach import DvachParser
        return DvachParser(board, thread)

    raise NotImplementedError(f"Parser for {site} is not implemented")

43
scrapthechan/parsers/dvach.py

@ -0,0 +1,43 @@
from re import match
from typing import List, Optional
from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser
__all__ = ["DvachParser"]
class DvachParser(Parser):
    """JSON parser for 2ch.hk image board."""

    __url_thread_json = "https://2ch.hk/{board}/res/{thread}.json"
    __url_file_link = "https://2ch.hk"

    def __init__(self, board: str, thread: str,
            skip_posts: Optional[int] = None) -> None:
        """Fetch the JSON of `thread` on `board` and parse its posts.

        Raises whatever `_get_json` raises when the thread can't be fetched."""
        posts = self._get_json(self.__url_thread_json.format(board=board,
            thread=thread))['threads'][0]['posts']
        super(DvachParser, self).__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        return "2ch.hk"

    @property
    def op(self) -> str:
        return f"{self._op_post['subject']}\n{self._op_post['comment']}"

    def _parse_post(self, post) -> Optional[List[FileInfo]]:
        """Return the files of `post`, or `None` when it has none."""
        # A missing key and an empty/null value are treated alike.
        if not post.get('files'): return None
        files = []
        for f in post['files']:
            # re.match takes the pattern first. A generic "image.<ext>" name
            # says nothing about the file, so the `name` field is used
            # instead of `fullname` in that case.
            if match(r"^image\.\w+$", f['fullname']) is None:
                fullname = f['fullname']
            else:
                fullname = f['name']
            # Here's same thing as 4chan. 2ch.hk also has md5 field, so it is
            # completely fine to hardcode `hash_algo`.
            files.append(FileInfo(fullname, f['size'],
                f"{self.__url_file_link}{f['path']}",
                f['md5'], 'md5'))
        return files

49
scrapthechan/parsers/fourchan.py

@ -0,0 +1,49 @@
from re import match
from typing import List, Optional
from scrapthechan.fileinfo import FileInfo
from scrapthechan.parser import Parser
__all__ = ["FourChanParser"]
class FourChanParser(Parser):
    """JSON parser for 4chan.org image board."""

    __url_thread_json = "https://a.4cdn.org/{board}/thread/{thread}.json"
    __url_file_link = "https://i.4cdn.org/{board}/{filename}"

    def __init__(self, board: str, thread: str,
            skip_posts: Optional[int] = None) -> None:
        """Fetch the JSON of `thread` on `board` and parse its posts.

        Raises whatever `_get_json` raises when the thread can't be fetched."""
        posts = self._get_json(self.__url_thread_json.format(board=board,
            thread=thread))['posts']
        super(FourChanParser, self).__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        return "4chan.org"

    @property
    def op(self) -> str:
        """Returns OP's subject and comment separated by a new line; a part
        that is absent from the post is left out."""
        present = [self._op_post[key] for key in ('sub', 'com')
            if key in self._op_post]
        return "\n".join(present)

    def _parse_post(self, post: dict) -> List[FileInfo]:
        """Return a one-element list with the file of `post`, or `None` when
        the post has no file."""
        if 'tim' not in post: return None
        dlfname = f"{post['tim']}{post['ext']}"
        # Fall back to the download name, so `filename` is always bound even
        # when the post carries no "filename" field.
        filename = dlfname
        # re.match takes the pattern first; with the arguments swapped the
        # poster-supplied name would be compiled as a regular expression.
        # NOTE(review): the branch direction is kept from the original code --
        # the posted name is used only when it matches "image.<ext>". This
        # looks inverted compared with DvachParser; confirm the intent.
        if "filename" in post \
            and match(r"^image\.\w+$", post['filename']) is not None:
            filename = f"{post['filename']}{post['ext']}"
        # Hash algorithm is hardcoded since it is highly unlikely that it will
        # be changed in foreseeable future. And if it'll change then this line
        # will be necessarily updated anyway.
        return [FileInfo(filename, post['fsize'],
            self.__url_file_link.format(board=self.board, filename=dlfname),
            post['md5'], 'md5')]

57
scrapthechan/parsers/lainchan.py

@ -0,0 +1,57 @@
from re import match
from typing import List, Optional
from scrapthechan.parser import Parser
from scrapthechan.fileinfo import FileInfo
__all__ = ["LainchanParser"]
class LainchanParser(Parser):
    """JSON parser for lainchan.org image board.

    The fields read here are the same ones the 4chan.org parser reads, plus
    the `extra_files` list that holds additional files of a post.
    """

    __url_thread_json = "https://lainchan.org/{board}/res/{thread}.json"
    __url_file_link = "https://lainchan.org/{board}/src/{filename}"

    def __init__(self, board: str, thread: str,
            skip_posts: Optional[int] = None) -> None:
        """Fetch the JSON of `thread` on `board` and parse its posts.

        Raises whatever `_get_json` raises when the thread can't be fetched."""
        posts = self._get_json(self.__url_thread_json.format(board=board,
            thread=thread))['posts']
        super(LainchanParser, self).__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        return "lainchan.org"

    @property
    def op(self) -> str:
        """Returns OP's subject and comment separated by a new line; a part
        that is absent from the post is left out."""
        # Without this override the base class raises NotImplementedError as
        # soon as a caller asks for OP's post.
        present = [self._op_post[key] for key in ('sub', 'com')
            if key in self._op_post]
        return "\n".join(present)

    def _file_from_entry(self, entry: dict) -> FileInfo:
        """Build a `FileInfo` from a post or from one `extra_files` entry."""
        dlfname = f"{entry['tim']}{entry['ext']}"
        # Fall back to the download name, so `filename` is always bound.
        filename = dlfname
        # re.match takes the pattern first; with the arguments swapped the
        # poster-supplied name would be compiled as a regular expression.
        # NOTE(review): the branch direction is kept from the original code --
        # the posted name is used only when it matches "image.<ext>". This
        # looks inverted compared with DvachParser; confirm the intent.
        if "filename" in entry \
            and match(r"^image\.\w+$", entry['filename']) is not None:
            filename = f"{entry['filename']}{entry['ext']}"
        dlurl = self.__url_file_link.format(board=self.board,
            filename=dlfname)
        return FileInfo(filename, entry['fsize'], dlurl, entry['md5'], 'md5')

    def _parse_post(self, post) -> List[FileInfo]:
        """Return the files of `post`, or `None` when it has none."""
        if 'tim' not in post: return None
        files = [self._file_from_entry(post)]
        # Each extra file is described by its own entry, so its name and
        # extension are read from that entry and not from the post.
        for extra in post.get("extra_files", ()):
            files.append(self._file_from_entry(extra))
        return files

96
scrapthechan/scraper.py

@ -0,0 +1,96 @@
"""Base Scraper implementation."""
from base64 import b64encode
from os import remove, stat
from os.path import exists, join, getsize
import re
from typing import List, Callable
from urllib.request import urlretrieve, URLopener
import hashlib
from scrapthechan import __version__
from scrapthechan.fileinfo import FileInfo
__all__ = ["Scraper"]
class Scraper:
    """Base scraper implementation.

    Subclasses implement `run` and use `_download_file` for each file.

    Arguments:
    save_directory -- a path to a directory where file will be
    saved;
    files -- a list of FileInfo objects;
    download_progress_callback -- a callback function that will be called
    for each file started downloading.
    """

    def __init__(self, save_directory: str, files: List[FileInfo],
            download_progress_callback: Callable[[int], None] = None) -> None:
        self._save_directory = save_directory
        self._files = files
        # NOTE(review): URLopener is deprecated in the standard library;
        # consider urllib.request.build_opener with a User-Agent header.
        self._url_opener = URLopener()
        self._url_opener.version = f"ScrapTheChan/{__version__}"
        self._progress_callback = download_progress_callback

    def run(self):
        """Download all the files. Must be overridden by a subclass."""
        raise NotImplementedError

    def _same_filename(self, filename: str, path: str) -> str:
        """Check if there is a file with same name. If so then add incremental
        number enclosed in brackets to a name of a new one.

        Returns `filename` unchanged when nothing in `path` has that name,
        otherwise the first free name of the form `name(N).ext` (or `name(N)`
        for a name without an extension)."""
        newname = filename
        while exists(join(path, newname)):
            stem, dot, extension = newname.rpartition(".")
            if not dot:
                # No extension: rpartition put the whole name in the last slot.
                stem, extension = newname, ""
            numbered = re.fullmatch(r"(?P<base>.*)\((?P<num>\d+)\)", stem,
                re.DOTALL)
            if numbered is None:
                stem = f"{stem}(1)"
            else:
                stem = f"{numbered['base']}({int(numbered['num']) + 1})"
            # Every pass yields a new name, so the loop always terminates.
            newname = f"{stem}{dot}{extension}"
        return newname

    def _hash_file(self, filename: str, hash_algo: str = "md5",
            blocksize: int = 1048576) -> (str, bytes):
        """Compute hash of a file.

        Returns a pair: the digest as a hex string and as raw bytes. The file
        is read in chunks of `blocksize` bytes."""
        hash_func = hashlib.new(hash_algo)
        with open(filename, 'rb') as f:
            buf = f.read(blocksize)
            while len(buf) > 0:
                hash_func.update(buf)
                buf = f.read(blocksize)
        return hash_func.hexdigest(), hash_func.digest()

    def _is_file_ok(self, f: FileInfo, filepath: str) -> bool:
        """Check if a file exist and isn't broken.

        Size and hash must both agree with what `f` reports."""
        if not exists(filepath):
            return False
        computed_size = getsize(filepath)
        # The reported size is accepted either in bytes or in rounded
        # kilobytes.
        is_size_match = f.size == computed_size \
            or f.size == round(computed_size / 1024)
        if not is_size_match:
            # No need to read and hash the file when the size already differs.
            return False
        hexdig, dig = self._hash_file(filepath, f.hash_algo)
        # The reported hash is accepted either hex- or base64-encoded.
        return f.hash_value == hexdig \
            or f.hash_value == b64encode(dig).decode()

    def _download_file(self, f: FileInfo):
        """Download a single file.

        Returns `True` without downloading when an intact copy is already in
        the save directory, otherwise `None` after the download. When a
        different file with the same name exists, the new one is saved under
        a numbered name."""
        filepath = join(self._save_directory, f.name)
        if self._is_file_ok(f, filepath):
            return True
        elif exists(filepath):
            filepath = join(self._save_directory,
                self._same_filename(f.name, self._save_directory))
        self._url_opener.retrieve(f.dlurl, filepath)

0
scrapthechan/scrapers/__init__.py

15
scrapthechan/scrapers/basicscraper.py

@ -0,0 +1,15 @@
"""Implementation of basic sequential one-threaded scraper that downloads
files one by one."""
from scrapthechan.scraper import Scraper
__all__ = ["BasicScraper"]
class BasicScraper(Scraper):
    """Scraper that fetches the files sequentially, in list order."""

    def run(self):
        """Download files one by one."""
        position = 0
        for file_info in self._files:
            position += 1
            # Report the 1-based position of the file before fetching it.
            notify = self._progress_callback
            if notify is not None:
                notify(position)
            self._download_file(file_info)

42
setup.cfg

@ -0,0 +1,42 @@
[metadata]
name = scrapthechan
version = attr: scrapthechan.__version__
description =
Scrap the files posted in a thread on an imageboard. Currently supports
4chan.org, lainchan.org and 2ch.hk.
long_description = file: README.md
long_description_content_type = text/markdown
author = Alexander "Arav" Andreev
author_email = me@arav.top
url = https://arav.top
keywords =
scraper
imageboard
4chan
2ch
lainchan
license = MIT
license_file = COPYING
classifiers =
Development Status :: 2 - Pre-Alpha
Environment :: Console
Intended Audience :: End Users/Desktop
License :: OSI Approved :: MIT License
Natural Language :: English
Operating System :: OS Independent
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Topic :: Utilities
[options]
zip_safe = False
python_requires = >=3.7
include_package_data = True
packages = find:
[options.package_data]
* = COPYING, README.md
[options.entry_points]
console_scripts =
scrapthechan = scrapthechan.cli.scraper:main

3
setup.py

@ -0,0 +1,3 @@
# No arguments are passed to setup() here; the package metadata and options
# are declared in setup.cfg.
from setuptools import setup
setup()
Loading…
Cancel
Save