# imgcrawler.py
import os
import json
import requests
import re
from bs4 import BeautifulSoup
from utils.char import *
from utils.url import *
from utils.handlers import *
from uuid import uuid4
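# Third-party dependencies: requests and beautifulsoup4
# (pip install requests beautifulsoup4).
# readfile, makerequest, isurl and makeurl come from this repo's local
# utils modules (utils.char, utils.url, utils.handlers).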
# Scrape image URLs from an HTML page (or a local HTML file) and
# download them to a target directory.
# Config precedence (highest first):
#   1. interactive prompts (stand-in for the planned command-line flags)
#   2. setup.json
#   3. built-in defaults below
# TODO:
#   * log progress to the user
#   * command-line parameters:
#       -s or --skip (skip the prompts)
#       -w or --site (site URL)
#       -u or --url  (URL to get)
#       -p or --path (path for the images)
config = {
    'local': False,      # parse a local HTML file instead of fetching `site`
    'replace': [],       # optional [old, new] substring pair applied to each URL
    'site': '',          # page to scrape
    'local_file': '',    # path to the local HTML file when `local` is True
    'dirname': '',       # default answer for the "Real URL" prompt
    'query': 'img',      # CSS selector for the image elements
    'savedir': '',       # output directory; falls back to the page title
    'attr': 'src',       # attribute holding the image URL (e.g. 'src', 'data-src')
}
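# A hypothetical setup.json illustrating the keys above (values are made up):
# {
#   "site": "https://example.com/gallery",
#   "query": "div.gallery img",
#   "attr": "data-src",
#   "replace": ["/thumbs/", "/full/"],
#   "savedir": "downloads"
# }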
# setup.json overrides the built-in defaults; the prompts below override both
with readfile('setup.json', mode='rb', report=True) as json_file:
    config = {**config, **json.load(json_file)}
site = input("Website: ") or config['site']
newdirname = input("Real URL: ") or config['dirname']  # read but currently unused
savedir = input("Save location: ") or config['savedir']
selector = input("Query for (any CSS selector for an <img>): ") or config['query']
replace = config['replace']
local = config['local']
local_file = config['local_file']
attr = config['attr']
if local:
    # parse a saved HTML file instead of hitting the network
    with readfile(local_file, 'rb', report=True) as html:
        dom = BeautifulSoup(html, 'html.parser')
else:
    response = makerequest(site)
    dom = BeautifulSoup(response.text, 'html.parser')
sources = []  # collected image URLs
try:
    for element in dom.select(selector):
        # skip elements that lack the configured URL attribute
        # (see https://www.crummy.com/software/BeautifulSoup/bs4/doc/#miscellaneous)
        if element.get(attr):
            url = element[attr]
        else:
            continue
        # strip the query string / fragment before testing the extension
        urltest = re.sub(r'(\?|#).*', '', url)
        # keep only image types web browsers support
        # (png/apng, the jpeg family, webp, gif, bmp, svg, avif, tiff, ico)
        isimg = re.search(r'\.(a?png|p?j(fif|pe?g?)|webp|gif|bmp|svg|avif|tiff?|ico)$', urltest)
        if not isimg:
            continue
        # relative path: resolve it against the site base URL
        if not isurl(url):
            url = makeurl(baseurl=site, path=url)
        if replace:
            # e.g. replace = ['/thumbs/', '/full/'] rewrites thumbnail URLs
            url = url.replace(replace[0], replace[1])
        sources.append(url)
except Exception:
    # TODO: report parse errors to the user instead of silently swallowing them
    pass
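# The selector can match the same image more than once (e.g. repeated
# thumbnails); an optional, order-preserving dedup if that matters here:
# sources = list(dict.fromkeys(sources))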
def get():
    global savedir
    if not savedir:
        # fall back to the page title, or a random name if there is none
        savedir = dom.title.string if dom.title else str(uuid4())
    os.makedirs(savedir, exist_ok=True)  # exist_ok makes a prior exists() check redundant
    try:
        makerequest(sources=sources, output=savedir)
    except KeyError:
        # tolerate a KeyError from inside the makerequest helper
        pass

get()
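# Usage (from the repo root, next to setup.json and utils/):
#   python imgcrawler.py
# Press Enter at any prompt to fall back to the setup.json value or the default.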