diff --git a/Example/sample.py b/Example/sample.py index 32aa5a4..8c97187 100644 --- a/Example/sample.py +++ b/Example/sample.py @@ -2,7 +2,7 @@ my_downloader = simp.Downloader() -my_downloader.search_urls('Landsapes',limit=10, verbose=True) +my_downloader.search_urls('Landscapes',limit=10, verbose=True) # Get List of Saved URLs in cache print(my_downloader.get_urls()) @@ -13,15 +13,21 @@ # Download + search file my_downloader.download('spaceship', limit=2) -# Now donwload all the Searched picture +# Now download all the Searched picture my_downloader.download(download_cache=True) # Flush cache my_downloader.flush_cache() -# Change Direcotory +# Change Directory my_downloader.directory = 'my_dir/' # Change File extension type my_downloader.extensions = '.jpg' print(my_downloader.extensions) my_downloader.download('laptop', limit=10, verbose=True) + + +# Flush cache +my_downloader.flush_cache() +# Example with Google filters +my_downloader.download('space', limit=10, verbose=True, filters={'size': 'l', 'specific_color': 'orange'}) \ No newline at end of file diff --git a/simple_image_download/simple_image_download.py b/simple_image_download/simple_image_download.py index ae673ca..5bb089d 100644 --- a/simple_image_download/simple_image_download.py +++ b/simple_image_download/simple_image_download.py @@ -9,7 +9,6 @@ import functools import time - ################ # ---> CONSTANTS ################ @@ -18,20 +17,80 @@ GOOGLE_PICTURE_ID = '''&biw=1536&bih=674&tbm=isch&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ''' HEADERS = { 'User-Agent': - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36" } SCANNER_COUNTER = None +VALID_FILTERS = { + 'size': ['l', 'm', 'i'], # large, medium, icon + 'color': ['gray', 'trans'], # black and white, 
transparent + 'specific_color': ['red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown'], + 'type': ['clipart', 'lineart', 'animated'], + 'time': ['d', 'w', 'm', 'y'], # last 24 hours, last week, last month, last year + 'usage_rights': ['cl', 'ol'] # Creative Commons License, Commercial and other Licenses +} + def generate_search_url(keywords): - keywords_to_search = [str(item).strip() for item in keywords.split(',')][0].split() + keywords_to_search = keywords.split(',') keywords_count = len(keywords_to_search) return keywords_to_search, keywords_count -def generate_urls(search): - """Generates a URLS in the correct format that brings to Google Image seearch page""" - return [(BASE_URL+quote(word)+GOOGLE_PICTURE_ID) for word in search] +def check_if_filters_are_valid(filters): + result = "" + is_valid = True + if 'color' in filters.keys() and 'specific_color' in filters.keys(): + is_valid = False + result = "Cannot have both 'color' and 'specific_color' keys at the same time." + else: + for key, value in filters.items(): + if key in VALID_FILTERS.keys(): + if value not in VALID_FILTERS[key]: + result = "Filter value '{0}' for key '{1}' is not valid. Valid values are: {2}".format(value, key, + VALID_FILTERS[ + key]) + is_valid = False + break + else: + result = "Filter key '{0}' not valid. 
Valid keys are: {1}".format(quote(key), list(VALID_FILTERS.keys())) + is_valid = False + break + return is_valid, result + + +def generate_filters_string(filters): + is_valid_filters, filter_error = check_if_filters_are_valid(filters) + if is_valid_filters: + filter_str = "&tbs=" + keys_list = list(filters) + for i in range(len(filters.items())): + key = keys_list[i] + value = filters[key] + if key == 'size': + filter_str += "isz:" + value + elif key == 'color': + filter_str += "ic:" + value + elif key == 'specific_color': + filter_str += "ic:specific%2Cisc:" + value + elif key == 'type': + filter_str += "itp:" + value + elif key == 'time': + filter_str += "qdr:" + value + elif key == 'usage_rights': + filter_str += "il:" + value + + if i + 1 < len(filters.items()): + filter_str += "%2C" + return filter_str + else: + print(filter_error) + return "" + + +def generate_urls(search, filters): + """Generates URLs in the correct format that lead to the Google Image search page""" + return [(BASE_URL + quote(word) + filters + GOOGLE_PICTURE_ID) for word in search] def check_webpage(url): @@ -74,6 +133,7 @@ class Downloader: Main Downloader ::param extension:iterable of Files extensions """ + def __init__(self, extensions=None): if extensions: self._extensions = set(*[extensions]) @@ -103,7 +163,6 @@ def extensions(self): def extensions(self, value): self._extensions = set([value]) - def get_urls(self): return [self._cached_urls[url][1].url for url in self._cached_urls] @@ -114,10 +173,13 @@ def _download_page(self, url): resp_data = str(resp.read()) return resp_data - def search_urls(self, keywords, limit=1, verbose=False, cache=True, timer=None): + def search_urls(self, keywords, limit=10, verbose=False, cache=True, timer=None, filters={}): cache_out = {} search, count = generate_search_url(keywords) - urls_ = generate_urls(search) + filters_ = generate_filters_string(filters) + if filters is not {} and filters_ == "": + return + urls_ = generate_urls(search, filters_) 
timer = timer if timer else 1000 max_progressbar = count * (list(range(limit+1))[-1]+1) bar = progressbar.ProgressBar(maxval=max_progressbar, @@ -146,16 +208,19 @@ def search_urls(self, keywords, limit=1, verbose=False, cache=True, timer=None): self._cached_urls = cache_out if not cache_out: print('==='*15 + ' < ' + 'NO PICTURES FOUND' + ' > ' + '==='*15) + return cache_out - def download(self, keywords=None, limit=1, verbose=False, cache=True, download_cache=False, timer=None): + + def download(self, keywords=None, limit=1, verbose=False, cache=True, download_cache=False, timer=None, filters={}): if not download_cache: - content = self.search_urls(keywords, limit, verbose, cache, timer) + content = self.search_urls(keywords, limit, verbose, cache, timer, filters) else: content = self._cached_urls if not content: print('Downloader has not URLs saved in Memory yet, run Downloader.search_urls to find pics first') for name, (path, url) in content.items(): + name = name.replace(" ", "_") with open(os.path.join(path, name), 'wb') as file: file.write(url.content) if verbose: