Media Post Scraping

Extract and download media content from tweets, including photos, videos, GIFs, and high-quality image variants.

Overview

The media scraper retrieves visual content from tweets, enabling archival, analysis, and content curation. It supports downloading photos at various resolutions, videos in multiple qualities, and animated GIFs.

Use Cases

Content Archival: Preserve media before potential deletion
Asset Collection: Build image libraries for inspiration
Research: Analyze visual content trends
Backup: Archive your own media posts
Curation: Collect media for newsletters or compilations

Basic Usage

import asyncio
from xeepy import Xeepy

async def scrape_media():
    """Scrape up to 50 media tweets for "nasa" and print each attachment's type and URL."""
    async with Xeepy() as client:
        # Fetch the media posts for the given user.
        posts = await client.scrape.media("nasa", limit=50)

        for post in posts:
            # Only the first 50 characters of the tweet text are shown.
            preview = post.text[:50]
            print(f"Tweet: {preview}...")
            for item in post.media:
                print(f"  Type: {item.type}")
                print(f"  URL: {item.url}")


asyncio.run(scrape_media())

Downloading Media Files

async def download_media():
    """Download the media of two example tweets and print each file's name and size."""
    # Photos, videos and GIFs are all requested; hq_images asks for the
    # highest quality images.
    download_options = {
        "tweet_ids": ["123456789", "987654321"],
        "output_dir": "downloads/media",
        "photos": True,
        "videos": True,
        "gifs": True,
        "hq_images": True,
    }

    async with Xeepy() as client:
        outcome = await client.media.download(**download_options)

        print(f"Downloaded {len(outcome.files)} files")
        for entry in outcome.files:
            print(f"  {entry.filename}: {entry.size_mb:.1f} MB")


asyncio.run(download_media())

Scraping User Media Gallery

async def scrape_user_gallery():
    """Scrape the media tab of "natgeo" and print how many photos, videos and GIFs it holds."""
    async with Xeepy() as client:
        gallery = await client.scrape.media(
            username="natgeo",
            limit=200,
            media_type="all",        # accepted values: all, photos, videos
            include_retweets=False,  # original content only
        )

        # One bucket per recognised media type; any other type is ignored.
        buckets = {"photo": [], "video": [], "animated_gif": []}
        for post in gallery:
            for item in post.media:
                bucket = buckets.get(item.type)
                if bucket is not None:
                    bucket.append(item)

        print(f"Photos: {len(buckets['photo'])}")
        print(f"Videos: {len(buckets['video'])}")
        print(f"GIFs: {len(buckets['animated_gif'])}")


asyncio.run(scrape_user_gallery())

Batch Media Download

async def batch_download():
    """Download media for several accounts, one output directory per account.

    Each account's files go to ``media/<username>``. A failure for one
    account is reported and skipped so the remaining accounts are still
    processed, and a short pause separates consecutive accounts to space
    out the bulk downloads.
    """
    users = ["nasa", "natgeo", "bbcearth"]
    pause_seconds = 2  # gap between accounts; tune to your own rate limits

    async with Xeepy() as x:
        for index, username in enumerate(users):
            if index:
                # Space out bulk downloads to avoid rate limits; no pause is
                # needed before the first account.
                await asyncio.sleep(pause_seconds)

            print(f"\nProcessing @{username}...")

            try:
                # Download user's media
                paths = await x.media.download_user_media(
                    username=username,
                    output_dir=f"media/{username}",
                    limit=100,
                    photos=True,
                    videos=True,
                    hq_images=True,
                )
            except Exception as exc:
                # Some media may be unavailable; do not let one account abort
                # the whole batch.
                # NOTE(review): narrow this to the library's own exception
                # type once it is confirmed.
                print(f"  Failed for @{username}: {exc}")
                continue

            print(f"  Downloaded {len(paths)} files")


asyncio.run(batch_download())

Video Quality Options

async def download_videos():
    """Download one example tweet's video at the highest quality and print each video's details."""
    async with Xeepy() as client:
        outcome = await client.media.download(
            tweet_ids=["123456789"],
            output_dir="videos",
            videos=True,
            video_quality="highest",  # one of: highest, 720p, 480p, lowest
            include_thumbnail=True,   # also save video thumbnails
        )

        for entry in outcome.files:
            if entry.type != "video":
                # Only video files are reported.
                continue
            print(f"Video: {entry.filename}")
            print(f"  Resolution: {entry.width}x{entry.height}")
            print(f"  Duration: {entry.duration_seconds}s")
            print(f"  Size: {entry.size_mb:.1f} MB")


asyncio.run(download_videos())

Configuration Options

Parameter	Type	Default	Description
`username`	str	required	Target username
`limit`	int	50	Maximum media tweets
`media_type`	str	"all"	all, photos, videos
`include_retweets`	bool	True	Include retweeted media
`output_dir`	str	"media"	Download directory
`photos`	bool	True	Download photos
`videos`	bool	True	Download videos
`gifs`	bool	True	Download GIFs
`hq_images`	bool	False	Highest quality images
`video_quality`	str	"highest"	Video quality preference: highest, 720p, 480p, or lowest

High Quality Images

Use `hq_images=True` to download the largest available image variants. Twitter stores multiple resolutions; this option fetches the original upload quality.

Video Limitations

Some videos may have DRM or playback restrictions. The scraper downloads the best available quality that's publicly accessible.

Image Quality Variants

async def get_image_variants():
    """Print the URL of every size variant for each photo in a user's media tweets."""
    # (label, attribute) pairs, listed in the order they are printed.
    variant_fields = (
        ("Thumb", "thumb_url"),
        ("Small", "small_url"),
        ("Medium", "medium_url"),
        ("Large", "large_url"),
        ("Original", "original_url"),
    )

    async with Xeepy() as client:
        posts = await client.scrape.media("username", limit=10)

        for post in posts:
            for item in post.media:
                if item.type != "photo":
                    continue
                print("\nImage variants:")
                for label, field in variant_fields:
                    # Each attribute is read just before it is printed.
                    print(f"  {label}: {getattr(item, field)}")


asyncio.run(get_image_variants())

Media Metadata Extraction

async def extract_media_metadata():
    """Print tweet-level and per-attachment metadata for a user's media tweets."""
    async with Xeepy() as client:
        posts = await client.scrape.media("username", limit=50)

        for post in posts:
            print(f"\nTweet ID: {post.id}")
            print(f"Posted: {post.created_at}")

            for item in post.media:
                print(f"\n  Media ID: {item.id}")
                print(f"  Type: {item.type}")
                print(f"  Dimensions: {item.width}x{item.height}")

                # Duration and view count are printed for videos only.
                is_video = item.type == "video"
                if is_video:
                    print(f"  Duration: {item.duration_ms}ms")
                    print(f"  Views: {item.view_count}")

                # Alt text is printed only when it is present and non-empty.
                if item.alt_text:
                    print(f"  Alt text: {item.alt_text}")


asyncio.run(extract_media_metadata())

Organizing Downloaded Media

async def organized_download():
    """Download a user's media into ``media/<YYYY>/<MM>/<type>`` directories.

    Each tweet's ``created_at`` supplies the year/month part of the path and
    each attachment's ``type`` (photo, video, animated_gif) supplies the
    last component. Files are named ``<tweet id>_<media id>``.
    """
    async with Xeepy() as x:
        media_tweets = await x.scrape.media("username", limit=100)

        for tweet in media_tweets:
            # Organize by date and type
            date_str = tweet.created_at.strftime("%Y/%m")

            for media in tweet.media:
                type_dir = media.type  # photo, video, animated_gif
                output_path = f"media/{date_str}/{type_dir}"

                # Prefer the original-quality URL; fall back to the generic
                # media URL when it is missing or empty.
                # NOTE(review): original_url is only shown for photos on this
                # page -- confirm whether videos and GIFs expose it too.
                media_url = getattr(media, "original_url", None) or media.url

                await x.media.download_single(
                    media_url=media_url,
                    output_dir=output_path,
                    filename=f"{tweet.id}_{media.id}",
                )


asyncio.run(organized_download())

Best Practices

Respect Copyright: Only download media you have rights to use
Use HQ Sparingly: High-quality downloads consume more bandwidth and storage
Organize by Date: Structure directories by date for easy navigation
Check File Sizes: Videos can be large; monitor disk space
Handle Errors: Some media may be unavailable; implement error handling
Rate Limiting: Space out bulk downloads to avoid rate limits