Free Python Script for Extracting Content Publish Dates


!pip install beautifulsoup4

import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def fetch_published_date(url, timeout=10):
    """
    Fetches the published date of an article from its meta properties.

    Downloads the page and looks for a
    <meta property="article:published_time" content="..."> tag. The date is
    the part of the tag's content that precedes the first 'T' (for an
    ISO 8601 timestamp such as '2024-01-31T08:15:00+00:00' that is
    '2024-01-31').

    Parameters:
    - url (str): The URL of the web page.
    - timeout (float or tuple): Seconds to wait for the server, passed
      straight to requests.get. Without it a single unresponsive host
      would block the caller indefinitely.

    Returns:
    - str: The published date (normally 'YYYY-MM-DD') if available, else an
      empty string. An empty string is also returned when the response
      status is not 200, when the meta tag or its content attribute is
      missing, and when the request or parsing raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return ""

        soup = BeautifulSoup(response.text, 'html.parser')
        meta_tag = soup.find('meta', property='article:published_time')
        if meta_tag is None or 'content' not in meta_tag.attrs:
            return ""

        # Surrounding whitespace in the attribute would otherwise end up in
        # the returned date.
        published_time = meta_tag.attrs['content'].strip()
        return published_time.split('T')[0]  # Extracts date part
    except Exception:
        # Deliberately best-effort: this is called once per URL in a batch,
        # and one bad URL (DNS failure, timeout, malformed address, ...) must
        # not abort the whole run. The caller sees "" for "no date found".
        return ""

# Assuming 'urls.tsv' is the file where each line is a URL
urls_file = 'urls.tsv'
results = []

# newline='' is what the csv module expects for files it reads; 'utf-8-sig'
# reads plain UTF-8 and also drops a leading byte-order mark, which would
# otherwise be glued onto the first URL.
with open(urls_file, 'r', newline='', encoding='utf-8-sig') as file:
    tsv_reader = csv.reader(file, delimiter='\t')
    for row in tsv_reader:
        # csv.reader yields an empty list for a blank line, so indexing
        # row[0] unconditionally would raise IndexError. Rows whose first
        # column is empty carry no URL and are skipped as well.
        if not row or not row[0].strip():
            continue
        url = row[0].strip()  # Assuming each row contains a URL in the first column
        published_date = fetch_published_date(url)
        results.append((url, published_date))

# Save the results to a new TSV file
output_file = 'published_dates.tsv'

with open(output_file, 'w', newline='', encoding='utf-8') as file:
    tsv_writer = csv.writer(file, delimiter='\t')
    tsv_writer.writerow(['URL', 'Published Date'])  # Header
    # One output row per processed URL; the date column is empty when no
    # published date could be extracted.
    tsv_writer.writerows(results)

View

Free Python Script for Extracting Content Publish Dates

Atiqur R. Ashik

Get in Touch

Let’s Bring Your Vision to Life!

View

Free Python Script for Extracting Content Publish Dates

Atiqur R. Ashik

Subscribe to Our Newsletter:

Heading

Heading

Get in Touch

Let’s Bring Your Vision to Life!