Extract Features

The Extract operator converts web pages into structured, usable data. It provides three powerful methods for working with web content: converting to Markdown, generating JSON schemas, and extracting structured JSON data.

All Extract methods are async and should be used with await.

Overview

The extract operator is accessed through your TABStack client instance:

import asyncio
import os
from tabstack import TABStack

async def main():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        # Access extract methods
        await tabs.extract.markdown(url, metadata=False, nocache=False)
        await tabs.extract.schema(url, instructions, nocache=False)
        await tabs.extract.json(url, schema, nocache=False)

asyncio.run(main())

Extract Markdown

Convert any web page to clean, readable Markdown format.

Basic Usage

import asyncio
import os
from tabstack import TABStack

async def main():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.markdown('https://example.com/blog/article')
        print(result.content)

asyncio.run(main())

With Metadata

Extract additional page metadata (title, description, author, etc.) alongside the content:

async def extract_with_metadata():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.markdown(
            url='https://example.com/blog/article',
            metadata=True
        )

        print('Content:', result.content)
        if result.metadata:
            print('Title:', result.metadata.title)
            print('Description:', result.metadata.description)
            print('Author:', result.metadata.author)
            print('Image:', result.metadata.image)

asyncio.run(extract_with_metadata())
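
The metadata fields pair naturally with the content when building previews. Below is a minimal sketch that assembles a link-preview dictionary from the documented metadata attributes; the dictionary layout itself is just an illustration, not part of the SDK.

async def build_preview(tabs: TABStack, url: str) -> dict:
    # Fetch markdown and metadata in a single call
    result = await tabs.extract.markdown(url=url, metadata=True)
    meta = result.metadata

    # Assemble a simple preview card; the key names here are arbitrary
    return {
        'title': meta.title if meta else None,
        'description': meta.description if meta else None,
        'image': meta.image if meta else None,
        'source': meta.site_name if meta else result.url,
        'word_count': len(result.content.split()),
    }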

Bypass Cache

Force a fresh fetch when you need the latest content:

async def extract_fresh():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.markdown(
            url='https://example.com/live-prices',
            nocache=True
        )
        print(result.content)

asyncio.run(extract_fresh())
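
Because nocache=True always fetches the live page, it pairs well with a simple polling loop. The interval and iteration count below are placeholders, a sketch rather than a recommendation:

async def poll_live_prices(interval_seconds: int = 60, iterations: int = 5):
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        for _ in range(iterations):
            # Every iteration bypasses the cache to get the latest content
            result = await tabs.extract.markdown(
                url='https://example.com/live-prices',
                nocache=True
            )
            print(result.content[:200])
            await asyncio.sleep(interval_seconds)

asyncio.run(poll_live_prices())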

Response Type

The markdown method returns a MarkdownResponse object:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Metadata:
    title: Optional[str]
    description: Optional[str]
    author: Optional[str]
    publisher: Optional[str]
    image: Optional[str]
    site_name: Optional[str]
    url: Optional[str]
    type: Optional[str]

@dataclass
class MarkdownResponse:
    url: str                      # Source URL
    content: str                  # Markdown content
    metadata: Optional[Metadata]  # Optional page metadata
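
Since the response is a plain dataclass, it can be passed around or persisted like any other Python object. Here is a small sketch that saves the markdown to disk using only the fields documented above; the file-naming scheme is just an example.

from pathlib import Path

async def save_article(tabs: TABStack, url: str) -> Path:
    result = await tabs.extract.markdown(url=url, metadata=True)

    # Fall back to a generic name if no title metadata is available
    title = result.metadata.title if result.metadata and result.metadata.title else 'article'
    path = Path(f"{title[:50].replace('/', '-')}.md")

    path.write_text(result.content, encoding='utf-8')
    return path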

Generate Schema

Automatically generate a JSON schema from any web page using AI.

Basic Usage

async def generate_schema():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.schema(
            url='https://news.ycombinator.com',
            instructions='extract top stories with title, points, and author'
        )
        print(result.schema)

asyncio.run(generate_schema())
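
Generated schemas can be deeply nested, so pretty-printing helps when reviewing them. Assuming result.schema is a plain dict (it is passed directly to extract.json below), the standard json module is enough:

import json

async def inspect_schema():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.schema(
            url='https://news.ycombinator.com',
            instructions='extract top stories with title, points, and author'
        )
        # Pretty-print the generated schema for review before reusing it
        print(json.dumps(result.schema, indent=2))

asyncio.run(inspect_schema())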

Use Generated Schema Immediately

Chain schema generation with extraction:

async def schema_then_extract():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        # Step 1: Generate the schema
        schema_result = await tabs.extract.schema(
            url='https://news.ycombinator.com',
            instructions='extract top 5 stories with title, points, and author'
        )

        # Step 2: Use it to extract data
        data = await tabs.extract.json(
            url='https://news.ycombinator.com',
            schema=schema_result.schema
        )
        print(data.data)

asyncio.run(schema_then_extract())

Extract JSON

Extract structured data matching your JSON schema from any web page.

Basic Usage

async def extract_json_basic():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "price": {"type": "number"},
                "in_stock": {"type": "boolean"}
            },
            "required": ["name", "price", "in_stock"]
        }

        result = await tabs.extract.json(
            url='https://shop.com/product/123',
            schema=schema
        )
        print(result.data)

asyncio.run(extract_json_basic())
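
If you want an extra local check that the returned payload conforms to your schema, you can validate it with the third-party jsonschema package. This is an optional step outside the SDK, sketched here under that assumption:

from jsonschema import ValidationError, validate  # pip install jsonschema

async def extract_and_validate(tabs: TABStack, url: str, schema: dict):
    result = await tabs.extract.json(url=url, schema=schema)
    try:
        # Raises ValidationError if the payload does not match the schema
        validate(instance=result.data, schema=schema)
    except ValidationError as error:
        print(f"Payload did not match schema: {error.message}")
        return None
    return result.data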

Real-World Examples

Example 1: News Scraping

import asyncio
import os
from tabstack import TABStack

async def scrape_news():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        schema = {
            "type": "object",
            "properties": {
                "articles": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "title": {"type": "string"},
                            "summary": {"type": "string"},
                            "url": {"type": "string"},
                            "published_at": {"type": "string"},
                            "category": {"type": "string"}
                        },
                        "required": ["title", "url"]
                    }
                }
            }
        }

        result = await tabs.extract.json('https://news.example.com', schema)

        for article in result.data['articles']:
            print(f"{article['title']} ({article.get('category', 'N/A')})")
            print(f"  {article['url']}")
            print(f"  Published: {article.get('published_at', 'N/A')}")
            print()

asyncio.run(scrape_news())
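
To keep the scraped articles around for later analysis, you could dump them straight to a JSON file with the standard library. This sketch takes the same schema defined in the example above as a parameter; the filename is just an example:

import json

async def scrape_and_save(schema: dict):
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.json('https://news.example.com', schema)

        # Persist the extracted articles for later processing
        with open('articles.json', 'w', encoding='utf-8') as f:
            json.dump(result.data['articles'], f, ensure_ascii=False, indent=2)

# asyncio.run(scrape_and_save(schema))  # reuse the schema defined above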

Example 2: E-commerce Product Extraction

async def extract_products():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        schema = {
            "type": "object",
            "properties": {
                "products": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "price": {"type": "number"},
                            "currency": {"type": "string"},
                            "in_stock": {"type": "boolean"},
                            "rating": {"type": "number"}
                        },
                        "required": ["name", "price"]
                    }
                }
            }
        }

        result = await tabs.extract.json(
            url='https://shop.example.com/category/laptops',
            schema=schema
        )

        # Filter and sort in-stock products by rating
        products = result.data['products']
        in_stock = [p for p in products if p.get('in_stock', False)]
        top_rated = sorted(in_stock, key=lambda x: x.get('rating', 0), reverse=True)[:5]

        print('Top 5 In-Stock Products:')
        for i, product in enumerate(top_rated, 1):
            print(f"{i}. {product['name']} - {product.get('currency', '$')}{product['price']}")
            print(f"   Rating: {product.get('rating', 'N/A')} stars")

asyncio.run(extract_products())

Example 3: Concurrent Multi-Page Extraction

async def extract_multiple_pages():
    urls = [
        'https://example.com/products/page-1',
        'https://example.com/products/page-2',
        'https://example.com/products/page-3'
    ]

    schema = {
        "type": "object",
        "properties": {
            "products": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "price": {"type": "number"}
                    }
                }
            }
        }
    }

    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        # Extract from all pages concurrently
        tasks = [tabs.extract.json(url, schema) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        all_products = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                print(f"Page {i+1} failed: {result}")
            else:
                all_products.extend(result.data['products'])

        print(f"Extracted {len(all_products)} total products")

asyncio.run(extract_multiple_pages())
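
For larger URL lists you may not want every request in flight at once. A common pattern is to cap concurrency with asyncio.Semaphore; the limit of 10 below is an arbitrary choice, and the extraction call itself is the same extract.json used above:

async def extract_with_limit(urls: list[str], schema: dict, limit: int = 10):
    semaphore = asyncio.Semaphore(limit)

    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        async def bounded_extract(url: str):
            # At most `limit` extractions run at the same time
            async with semaphore:
                return await tabs.extract.json(url, schema)

        tasks = [bounded_extract(url) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)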

Options Reference

extract.markdown()

Parameter   Type   Default    Description
url         str    required   URL to convert to markdown
metadata    bool   False      Include page metadata in response
nocache     bool   False      Bypass cache and force fresh fetch

extract.schema()

Parameter      Type   Default    Description
url            str    required   URL to analyze for schema generation
instructions   str    required   Natural language description of data to extract
nocache        bool   False      Bypass cache and force fresh fetch

extract.json()

Parameter   Type   Default    Description
url         str    required   URL to extract data from
schema      dict   required   JSON Schema defining the structure
nocache     bool   False      Bypass cache and force fresh fetch

Best Practices

1. Generate Schemas First

# ✅ Good: Generate schema automatically
schema_result = await tabs.extract.schema(
    url=url,
    instructions='extract product details'
)
schema = schema_result.schema

# ❌ Tedious: Writing complex schemas by hand
schema = {...}  # hundreds of lines written by hand

2. Use Async Context Manager

# ✅ Good: Automatic cleanup
async with TABStack(api_key=api_key) as tabs:
    result = await tabs.extract.markdown(url)

# ❌ Not recommended: Manual cleanup
tabs = TABStack(api_key=api_key)
try:
    result = await tabs.extract.markdown(url)
finally:
    await tabs.close()

3. Handle Errors Properly

# ✅ Good: Error handling
try:
    result = await tabs.extract.json(url, schema)
except Exception as error:
    print(f"Extraction failed: {error}")
    return None
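
For transient failures (network issues, rate limits) a small retry wrapper can help. The SDK's specific exception types are not listed here, so this sketch catches Exception broadly; in practice you would narrow it to the errors you expect:

async def extract_with_retry(tabs, url, schema, attempts: int = 3):
    for attempt in range(1, attempts + 1):
        try:
            return await tabs.extract.json(url, schema)
        except Exception as error:
            if attempt == attempts:
                raise
            print(f"Attempt {attempt} failed ({error}), retrying...")
            # Exponential backoff: 1s, 2s, 4s, ...
            await asyncio.sleep(2 ** (attempt - 1))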

4. Use Connection Pooling for Multiple Requests

# ✅ Good: Concurrent requests with connection pooling
async with TABStack(
    api_key=api_key,
    max_connections=50
) as tabs:
    tasks = [tabs.extract.markdown(url) for url in urls]
    results = await asyncio.gather(*tasks)

Next Steps