Web crawlers are automated programs that browse the World Wide Web in a methodical and automated manner. They are designed to systematically search through websites and collect data for various purposes, such as indexing web pages, extracting information, or monitoring changes. Web crawlers are commonly used by search engines to gather information about web pages and to provide more relevant search results to users.
In this document, as an example, we will use Python to crawl photos of Jade Dragon Snow Mountain on Flickr, along with their timestamps, locations, and publisher information.
Let’s begin with the results, so that you have an overview of what the web crawler produces.

Captured pictures related to Jade Dragon Snow Mountain

Corresponding title, location, uploader, and other information for each picture
The core code is listed below:
# NOTE: to emphasize, the code below is unable to operate independently --
# it relies on setup shown in the full script (the `flickr` client, the
# `downloaded` set, `downloaded_file`, `response`, `fieldnames`, imports).
# Indentation has been restored; the original excerpt was flattened.
# set up keywords for searching
keywords = [
    'yulong snow mountain',
    'jade dragon snow mountain',
]
# iterate over the keywords and fetch the photos matching each one
for keyword in keywords:
    photos = flickr.walk(text=keyword,
                         extras='url_o,url_l,url_c,url_z,url_n',
                         per_page=500,
                         sort='relevance')
    for i, photo in enumerate(photos):
        # get the id and metadata of each photo
        photo_id = photo.get('id')
        photo_info = flickr.photos.getInfo(photo_id=photo_id)[0]
        # record the photo id so a rerun can skip it
        downloaded.add(photo_id)
        with open(downloaded_file, 'a') as f:
            # bug fix: '\\n' wrote a literal backslash-n, not a newline
            f.write(photo_id + '\n')
        # store image (`response` comes from the download step of the full script)
        image = Image.open(BytesIO(response.content))
        image.save(os.path.join('images', f'{keyword.replace(" ", "_")}_{i}.jpg'))
        # store info; missing XML elements become None
        info = {
            'title': photo_info.find('title').text if photo_info.find('title') is not None else None,
            'date_taken': photo_info.find('dates').get('taken') if photo_info.find('dates') is not None else None,
        }
        with open('info.csv', 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(info)
In this era, AI tools can assist us to a great extent in developing web crawlers, but we still have to provide them with appropriate prompts, improve the code, and perform debugging locally ourselves. Here is a demonstration of my conversation with ChatGPT, and how I completed the entire program step by step.
import os
import requests
import csv
from PIL import Image
from io import BytesIO
from flickrapi import FlickrAPI
from requests.exceptions import ChunkedEncodingError, Timeout
# --- Flickr API configuration ---
MAX_RETRIES = 3  # download attempts per image URL
DOWNLOAD_TIMEOUT = 5  # seconds before a single download attempt times out
# SECURITY NOTE(review): credentials are hard-coded here; prefer loading them
# from environment variables so they are not committed to source control.
KEY = '832b47cbeb0eab46c9c16129da30ca56' # Flickr API key
SECRET = 'aa18bf431b150f15' # Flickr API secret
flickr = FlickrAPI(KEY, SECRET, format='etree') # create a FlickrAPI client returning ElementTree responses
print("Flickr API key and secret are set.")
# Search terms: Jade Dragon Snow Mountain itself plus nearby scenic spots.
keywords = [
    'jade dragon snow mountain',
    'yulong snow mountain',
    'Blue Moon Valley',
    'Baisha Village',
    'Ganhaizi Meadow',
]
print(f"Keywords are set to: {keywords}")
# Create the directory that will hold the downloaded images, if needed.
if not os.path.exists('images'):
    os.makedirs('images')
print("Image directory is ready.")
# Load the ids of photos fetched in previous runs, so a rerun can resume.
downloaded_file = 'downloaded.txt'
if os.path.exists(downloaded_file):
    with open(downloaded_file, 'r') as f:
        downloaded = set(line.strip() for line in f)
else:
    downloaded = set()
# Resume the sequential photo numbering from the previous run, else start at 0.
photo_number_file = 'photo_number.txt'
if os.path.exists(photo_number_file):
    with open(photo_number_file, 'r') as f:
        photo_number = int(f.read().strip())
else:
    photo_number = 0
print(f"Starting photo number is {photo_number}.")
# CSV columns for the per-photo metadata file.
fieldnames = ['photo_number', 'keyword', 'title', 'date_taken', 'latitude', 'longitude', 'description', 'owner', 'realname', 'location']
# Create info.csv with a header row on first run.
# Bug fix: write the header with the same utf-8 encoding used when the data
# rows are appended later, so the file is consistently encoded throughout.
if not os.path.exists('info.csv'):
    with open('info.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
print("CSV file is ready.")
# Search Flickr for each keyword and download the matching photos.
for keyword in keywords:
    print(f"Searching for keyword: {keyword}")
    photos = flickr.walk(text=keyword,
                         extras='url_c,url_l,url_o,url_z,url_n',  # request URLs for several sizes
                         per_page=500,
                         sort='relevance')
    # Take only the first N results per keyword.
    for i, photo in enumerate(photos):
        # The two primary keywords get a larger quota than the rest.
        if keyword == 'yulong snow mountain' or keyword == 'jade dragon snow mountain':
            if i >= 2000:
                break
        elif i >= 400:
            break
        photo_id = photo.get('id')
        # Bug fix: skip already-downloaded photos BEFORE calling getInfo,
        # so no API request is wasted on photos we will not process.
        if photo_id in downloaded:
            print(f"Skipping photo {photo_id} as it has already been downloaded.")
            continue
        photo_info = flickr.photos.getInfo(photo_id=photo_id)[0]
        saved_number = None  # filename number actually used, set on a successful save
        # Try the available sizes in order of preference, largest first.
        for size in ['url_o', 'url_l', 'url_c', 'url_z', 'url_n']:
            if size not in photo.keys():
                continue
            url = photo.get(size)
            response = None
            for _ in range(MAX_RETRIES):
                try:
                    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
                except Timeout:
                    print(f"Download timed out for image {photo_id}, skipping...")
                    break  # give up on this URL
                except ChunkedEncodingError:
                    print(f"Failed to download image {photo_id} due to ChunkedEncodingError, retrying...")
                    continue
                else:
                    break  # download succeeded
            else:
                print(f"Failed to download image {photo_id} after {MAX_RETRIES} attempts, skipping...")
                continue
            if response is None:
                # Bug fix: the original fell through after a Timeout and called
                # Image.open on an undefined/stale `response`; try the next size.
                continue
            # Save the image under a sequential number and persist the counter.
            try:
                image = Image.open(BytesIO(response.content))
                saved_number = photo_number
                image.save(os.path.join('images', f'{saved_number}.jpg'))
                photo_number += 1
                with open(photo_number_file, 'w') as f:
                    # bug fix: '\\n' wrote a literal backslash-n, not a newline
                    f.write(str(photo_number) + '\n')
                print(f"Image {photo_id} saved successfully.")
            except Exception as e:
                print(f"Unable to save image {photo_id} due to {str(e)}")
            break  # a usable size was found; do not check smaller ones
        # Record the photo id so reruns skip it (kept even when the download
        # failed, matching the original behavior of not re-trying bad URLs).
        downloaded.add(photo_id)
        with open(downloaded_file, 'a') as f:
            f.write(photo_id + '\n')
        print(f"Added photo {photo_id} to downloaded set.")
        # Fetch information about the uploader.
        user_id = photo_info.find('owner').get('nsid')
        user_info = flickr.people.getInfo(user_id=user_id)[0]
        # Collect the photo's metadata; missing XML elements become None.
        info = {
            # Bug fix: the original recorded the post-increment counter, which
            # was off by one vs. the saved filename; `saved_number` matches it
            # (None when no image could be saved for this photo).
            'photo_number': saved_number,
            'keyword': keyword,
            'title': photo_info.find('title').text if photo_info.find('title') is not None else None,
            'date_taken': photo_info.find('dates').get('taken') if photo_info.find('dates') is not None else None,
            'latitude': photo_info.find('location').get('latitude') if photo_info.find('location') is not None else None,
            'longitude': photo_info.find('location').get('longitude') if photo_info.find('location') is not None else None,
            'description': photo_info.find('description').text if photo_info.find('description') is not None else None,
            'owner': photo_info.find('owner').get('username') if photo_info.find('owner') is not None else None,
            'realname': user_info.find('realname').text if user_info.find('realname') is not None else None,
            'location': user_info.find('location').text if user_info.find('location') is not None else None,
        }
        with open('info.csv', 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(info)
        print(f"Image {photo_id} information saved successfully.")
print(f"{photo_number} images has been downloaded.")