1
0
Fork 0

Added support for lolifox.cc. Fixed User-Agent usage, so it is now applied correctly everywhere.

master
Alexander Andreev 2 years ago
parent 7825b53121
commit a106d5b739
  1. 13
      CHANGELOG.md
  2. 2
      Makefile
  3. 3
      README.md
  4. 4
      scrapthechan/__init__.py
  5. 6
      scrapthechan/parser.py
  6. 5
      scrapthechan/parsers/__init__.py
  7. 65
      scrapthechan/parsers/lolifox.py
  8. 1
      scrapthechan/scraper.py
  9. 15
      scrapthechan/scrapers/basicscraper.py
  10. 1
      setup.cfg

@ -1,5 +1,16 @@
# Changelog
## 0.3 - 2020-09-09
### Added
- Parser for lolifox.cc.
### Removed
- BasicScraper. Not needed anymore, there is a faster threaded version.
### Fixed
- Now User-Agent is correctly applied everywhere.
## 0.2.2 - 2020-07-20
### Added
- Parser for 8kun.top.
@ -14,11 +25,13 @@
- Consider that issue with size on 2ch.hk. Usually it really tells the size in
kB. The problem is that sometimes it is just wrong.
## 0.2.1 - 2020-07-18
### Changed
- Now the program tells you which thread doesn't exist or is about to be
scraped. That is useful for batch processing with scripts.
## 0.2.0 - 2020-07-18
### Added
- Threaded version of the scraper, so now it is fast as heck!

@ -1,7 +1,7 @@
build: scrapthechan README.md setup.cfg
python setup.py sdist bdist_wheel
install:
python -m pip install --upgrade dist/scrapthechan-0.2.2-py3-none-any.whl --user
python -m pip install --upgrade dist/scrapthechan-0.3-py3-none-any.whl --user
uninstall:
# We change directory so pip uninstall will run, it'll fail otherwise.
@cd ~/

@ -36,4 +36,5 @@ help for a program.
- [4chan.org](https://4chan.org) since 0.1.0
- [lainchan.org](https://lainchan.org) since 0.1.0
- [2ch.hk](https://2ch.hk) since 0.1.0
- [8kun.top](https://8kun.top) since 0.2.2
- [8kun.top](https://8kun.top) since 0.2.2
- [lolifox.cc](https://lolifox.cc) since 0.3

@ -1,5 +1,5 @@
__date__ = "20 July 2020"
__version__ = "0.2.2"
__date__ = "9 September 2020"
__version__ = "0.3"
__author__ = "Alexander \"Arav\" Andreev"
__email__ = "me@arav.top"
__copyright__ = f"Copyright (c) 2020 {__author__} <{__email__}>"

@ -4,8 +4,9 @@ from itertools import chain
from json import loads
from re import findall, match
from typing import List, Optional
from urllib.request import urlopen, urlretrieve
from urllib.request import urlopen, Request
from scrapthechan import USER_AGENT
from scrapthechan.fileinfo import FileInfo
@ -71,7 +72,8 @@ class Parser:
def _get_json(self, thread_url: str) -> dict:
    """Fetch a thread's JSON representation and return it as a dict.

    Args:
        thread_url: Full URL of the thread's JSON endpoint.

    Raises:
        ThreadNotFoundError: if the thread cannot be fetched or its
            body is not valid JSON / UTF-8.
    """
    # The User-Agent header must be set on the Request explicitly so it
    # is applied to this fetch (urllib's default UA may be rejected).
    req = Request(thread_url, headers={'User-Agent': USER_AGENT})
    try:
        with urlopen(req) as resp:
            return loads(resp.read().decode('utf-8'))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed and turned into a
        # "thread not found" report.
        raise ThreadNotFoundError

@ -9,7 +9,7 @@ __all__ = ["SUPPORTED_IMAGEBOARDS", "get_parser_by_url", "get_parser_by_site"]
SUPPORTED_IMAGEBOARDS: List[str] = ["4chan.org", "lainchan.org", "2ch.hk", \
"8kun.top"]
"8kun.top", "lolifox.cc"]
def get_parser_by_url(url: str) -> Parser:
@ -33,5 +33,8 @@ def get_parser_by_site(site: str, board: str, thread: str) -> Parser:
elif '8kun' in site:
from .eightkun import EightKunParser
return EightKunParser(board, thread)
elif 'lolifox' in site:
from .lolifox import LolifoxParser
return LolifoxParser(board, thread)
else:
raise NotImplementedError(f"Parser for {site} is not implemented")

@ -0,0 +1,65 @@
from re import match
from typing import List, Optional
from scrapthechan.parser import Parser
from scrapthechan.fileinfo import FileInfo
__all__ = ["LolifoxParser"]
class LolifoxParser(Parser):
    """JSON parser for the lolifox.cc imageboard.

    The JSON structure is identical to lainchan.org's.
    """

    __url_thread_json = "https://lolifox.cc/{board}/res/{thread}.json"
    # The {filename} placeholder is required by the .format() call in
    # __file_info below, which always passes the server-side name.
    __url_file_link = "https://lolifox.cc/{board}/src/{filename}"

    def __init__(self, board: str, thread: str,
            skip_posts: Optional[int] = None) -> None:
        posts = self._get_json(self.__url_thread_json.format(
            board=board, thread=thread))['posts']
        super().__init__(board, thread, posts, skip_posts)

    @property
    def imageboard(self) -> str:
        return "lolifox.cc"

    @property
    def op(self) -> Optional[str]:
        """Subject and comment of the OP post, or None if both are absent."""
        op = ""
        if 'sub' in self._op_post:
            op = f"{self._op_post['sub']}\n"
        if 'com' in self._op_post:
            op += self._op_post['com']
        return op if op != "" else None

    def __file_info(self, f: dict) -> FileInfo:
        """Build a FileInfo for one attachment record (a post or an
        entry of its `extra_files` list)."""
        # Server-side name is "<tim><ext>"; the download URL uses it.
        dlfname = f"{f['tim']}{f['ext']}"
        # Keep the uploader's filename unless it is missing or is the
        # generic "image.<ext>" placeholder; otherwise fall back to the
        # server name. Fixed here: re.match() takes the pattern FIRST —
        # the original call had the arguments swapped, and `filename`
        # was left unbound when the 'filename' key was absent.
        if 'filename' in f and match(r"^image\.\w+$", f['filename']) is None:
            filename = f"{f['filename']}{f['ext']}"
        else:
            filename = dlfname
        url = self.__url_file_link.format(board=self.board, filename=dlfname)
        return FileInfo(filename, f['fsize'], url, f['md5'], 'md5')

    def _parse_post(self, post) -> Optional[List[FileInfo]]:
        """Return a FileInfo for every file attached to a post.

        Returns None when the post carries no files ('tim' absent).
        """
        if 'tim' not in post:
            return None
        files = [self.__file_info(post)]
        # Fixed: the original loop read 'filename'/'ext'/'tim' from the
        # enclosing post instead of from each extra-file record.
        for extra in post.get('extra_files', []):
            files.append(self.__file_info(extra))
        return files

@ -29,6 +29,7 @@ class Scraper:
self._save_directory = save_directory
self._files = files
self._url_opener = URLopener()
self._url_opener.addheaders = [('User-Agent', USER_AGENT)]
self._url_opener.version = USER_AGENT
self._progress_callback = download_progress_callback

@ -1,15 +0,0 @@
"""Implementation of basic sequential one-threaded scraper that downloads
files one by one."""
from scrapthechan.scraper import Scraper
__all__ = ["BasicScraper"]
class BasicScraper(Scraper):
    """Sequential scraper that downloads files one by one in a single
    thread (slower than the threaded version)."""

    def run(self):
        """Download files one by one, reporting 1-based progress via the
        optional progress callback before each download."""
        for i, f in enumerate(self._files, start=1):
            # Fixed idiom: `x is not None` instead of `not x is None`
            # (PEP 8 / E714); behaviour is unchanged.
            if self._progress_callback is not None:
                self._progress_callback(i)
            self._download_file(f)

@ -14,6 +14,7 @@ keywords =
2ch.hk
lainchan.org
8kun.top
lolifox.cc
license = MIT
license_file = COPYING
classifiers =

Loading…
Cancel
Save