# imgcrawler.py
import os
import json
import requests
import re
from bs4 import BeautifulSoup
from utils.char import *
from utils.url import *
from utils.handlers import *
from uuid import uuid4
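# Third-party dependencies: requests and beautifulsoup4
# (pip install requests beautifulsoup4).
# readfile, makerequest, isurl and makeurl come from this repo's local
# utils modules (utils.char, utils.url, utils.handlers).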
# Scrape image URLs from an HTML page (or a local HTML file) and
# download them to a target directory.
# Config precedence (highest first):
#   1. interactive prompts (stand-in for the planned command-line flags)
#   2. setup.json
#   3. built-in defaults below
# TODO:
#   * log progress to the user
#   * command-line parameters:
#       -s or --skip (skip the prompts)
#       -w or --site (site URL)
#       -u or --url  (URL to get)
#       -p or --path (path for the images)
config = {
    'local': False,      # parse a local HTML file instead of fetching `site`
    'replace': [],       # optional [old, new] substring pair applied to each URL
    'site': '',          # page to scrape
    'local_file': '',    # path to the local HTML file when `local` is True
    'dirname': '',       # default answer for the "Real URL" prompt
    'query': 'img',      # CSS selector for the image elements
    'savedir': '',       # output directory; falls back to the page title
    'attr': 'src',       # attribute holding the image URL (e.g. 'src', 'data-src')
}
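# A hypothetical setup.json illustrating the keys above (values are made up):
# {
#   "site": "https://example.com/gallery",
#   "query": "div.gallery img",
#   "attr": "data-src",
#   "replace": ["/thumbs/", "/full/"],
#   "savedir": "downloads"
# }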
# setup.json overrides the built-in defaults; the prompts below override both
with readfile('setup.json', mode='rb', report=True) as json_file:
    config = {**config, **json.load(json_file)}
site = input("Website: ") or config['site']
newdirname = input("Real URL: ") or config['dirname']  # read but currently unused
savedir = input("Save location: ") or config['savedir']
selector = input("Query for (any CSS selector for an <img>): ") or config['query']
replace = config['replace']
local = config['local']
local_file = config['local_file']
attr = config['attr']
if local:
    # parse a saved HTML file instead of hitting the network
    with readfile(local_file, 'rb', report=True) as html:
        dom = BeautifulSoup(html, 'html.parser')
else:
    response = makerequest(site)
    dom = BeautifulSoup(response.text, 'html.parser')
sources = []  # collected image URLs
try:
    for element in dom.select(selector):
        # skip elements that lack the configured URL attribute
        # (see https://www.crummy.com/software/BeautifulSoup/bs4/doc/#miscellaneous)
        if element.get(attr):
            url = element[attr]
        else:
            continue
        # strip the query string / fragment before testing the extension
        urltest = re.sub(r'(\?|#).*', '', url)
        # keep only image types web browsers support
        # (png/apng, the jpeg family, webp, gif, bmp, svg, avif, tiff, ico)
        isimg = re.search(r'\.(a?png|p?j(fif|pe?g?)|webp|gif|bmp|svg|avif|tiff?|ico)$', urltest)
        if not isimg:
            continue
        # relative path: resolve it against the site base URL
        if not isurl(url):
            url = makeurl(baseurl=site, path=url)
        if replace:
            # e.g. replace = ['/thumbs/', '/full/'] rewrites thumbnail URLs
            url = url.replace(replace[0], replace[1])
        sources.append(url)
except Exception:
    # TODO: report parse errors to the user instead of silently swallowing them
    pass
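# The selector can match the same image more than once (e.g. repeated
# thumbnails); an optional, order-preserving dedup if that matters here:
# sources = list(dict.fromkeys(sources))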
def get():
    global savedir
    if not savedir:
        # fall back to the page title, or a random name if there is none
        savedir = dom.title.string if dom.title else str(uuid4())
    os.makedirs(savedir, exist_ok=True)  # exist_ok makes a prior exists() check redundant
    try:
        makerequest(sources=sources, output=savedir)
    except KeyError:
        # tolerate a KeyError from inside the makerequest helper
        pass

get()
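# Usage (from the repo root, next to setup.json and utils/):
#   python imgcrawler.py
# Press Enter at any prompt to fall back to the setup.json value or the default.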