Extract Features
The Extract operator converts web pages into structured, usable data. It provides three methods for working with web content: converting pages to Markdown, generating JSON schemas with AI, and extracting structured JSON data that matches a schema.
All Extract methods are async and must be awaited.
Overview
The extract operator is accessed through your TABStack client instance:
import asyncio
import os
from tabstack import TABStack

async def main():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        # Access extract methods
        await tabs.extract.markdown(url, metadata=False, nocache=False)
        await tabs.extract.schema(url, instructions, nocache=False)
        await tabs.extract.json(url, schema, nocache=False)

asyncio.run(main())
Extract Markdown
Convert any web page to clean, readable Markdown format.
Basic Usage
import asyncio
import os
from tabstack import TABStack

async def main():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.markdown('https://example.com/blog/article')
        print(result.content)

asyncio.run(main())
With Metadata
Extract additional page metadata (title, description, author, etc.) alongside the content:
async def extract_with_metadata():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.markdown(
            url='https://example.com/blog/article',
            metadata=True
        )
        print('Content:', result.content)
        if result.metadata:
            print('Title:', result.metadata.title)
            print('Description:', result.metadata.description)
            print('Author:', result.metadata.author)
            print('Image:', result.metadata.image)

asyncio.run(extract_with_metadata())
Bypass Cache
Force a fresh fetch when you need the latest content:
async def extract_fresh():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.markdown(
            url='https://example.com/live-prices',
            nocache=True
        )
        print(result.content)

asyncio.run(extract_fresh())
Response Type
The markdown method returns a MarkdownResponse object:
from dataclasses import dataclass
from typing import Optional

@dataclass
class Metadata:
    title: Optional[str]
    description: Optional[str]
    author: Optional[str]
    publisher: Optional[str]
    image: Optional[str]
    site_name: Optional[str]
    url: Optional[str]
    type: Optional[str]

@dataclass
class MarkdownResponse:
    url: str                      # Source URL
    content: str                  # Markdown content
    metadata: Optional[Metadata]  # Optional page metadata
Generate Schema
Automatically generate a JSON schema from any web page using AI.
Basic Usage
async def generate_schema():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.schema(
            url='https://news.ycombinator.com',
            instructions='extract top stories with title, points, and author'
        )
        print(result.schema)

asyncio.run(generate_schema())
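Because the generated schema is plain JSON Schema, you can pretty-print it for review or persist it and skip regeneration on later runs. A minimal sketch, assuming result.schema is a JSON-serializable dict (the next example passes it directly to extract.json):

import json

async def save_generated_schema():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        result = await tabs.extract.schema(
            url='https://news.ycombinator.com',
            instructions='extract top stories with title, points, and author'
        )
        # Review the schema before relying on it for extraction
        print(json.dumps(result.schema, indent=2))
        # Persist it so later runs can reuse it without another generation call
        with open('stories_schema.json', 'w') as f:
            json.dump(result.schema, f, indent=2)

asyncio.run(save_generated_schema())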
Use Generated Schema Immediately
Chain schema generation with extraction:
async def schema_then_extract():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        # Step 1: Generate the schema
        schema_result = await tabs.extract.schema(
            url='https://news.ycombinator.com',
            instructions='extract top 5 stories with title, points, and author'
        )
        # Step 2: Use it to extract data
        data = await tabs.extract.json(
            url='https://news.ycombinator.com',
            schema=schema_result.schema
        )
        print(data.data)

asyncio.run(schema_then_extract())
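This guide documents the response type for markdown only; the schema and json methods return their own response objects. Based solely on the usage above, they expose at least the following fields (an inferred sketch, not the SDK's documented definitions):

from dataclasses import dataclass
from typing import Any

@dataclass
class SchemaResponse:
    schema: dict  # inferred from result.schema above; other fields may exist

@dataclass
class JsonResponse:
    data: Any     # inferred from result.data above; other fields may exist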
Extract JSON
Extract structured data matching your JSON schema from any web page.
Basic Usage
async def extract_json_basic():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "price": {"type": "number"},
                "in_stock": {"type": "boolean"}
            },
            "required": ["name", "price", "in_stock"]
        }
        result = await tabs.extract.json(
            url='https://shop.example.com/product/123',
            schema=schema
        )
        print(result.data)

asyncio.run(extract_json_basic())
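Since extraction is driven by your schema, it can be worth validating the returned payload against that same schema before using it downstream. A sketch using the third-party jsonschema package (not part of the TABStack SDK; install it with pip install jsonschema):

from jsonschema import ValidationError, validate

async def extract_and_validate():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "price": {"type": "number"}
            },
            "required": ["name", "price"]
        }
        result = await tabs.extract.json('https://shop.example.com/product/123', schema)
        try:
            # Confirm the extracted payload matches the schema we requested
            validate(instance=result.data, schema=schema)
        except ValidationError as error:
            print(f"Extracted data does not match schema: {error.message}")
            return None
        return result.data

asyncio.run(extract_and_validate())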
Real-World Examples
Example 1: News Scraping
import asyncio
import os
from tabstack import TABStack

async def scrape_news():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        schema = {
            "type": "object",
            "properties": {
                "articles": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "title": {"type": "string"},
                            "summary": {"type": "string"},
                            "url": {"type": "string"},
                            "published_at": {"type": "string"},
                            "category": {"type": "string"}
                        },
                        "required": ["title", "url"]
                    }
                }
            }
        }
        result = await tabs.extract.json('https://news.example.com', schema)
        for article in result.data['articles']:
            print(f"{article['title']} ({article.get('category', 'N/A')})")
            print(f"  {article['url']}")
            print(f"  Published: {article.get('published_at', 'N/A')}")
            print()

asyncio.run(scrape_news())
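To keep scraped articles for later processing, the extracted dict can be written straight to disk, since result.data is plain JSON-compatible data. A small sketch reusing the schema from this example (the file name is an arbitrary choice):

import json

async def scrape_and_save():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        # schema: the news schema defined above
        result = await tabs.extract.json('https://news.example.com', schema)
        with open('articles.json', 'w') as f:
            json.dump(result.data, f, indent=2)

asyncio.run(scrape_and_save())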
Example 2: E-commerce Product Extraction
async def extract_products():
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        schema = {
            "type": "object",
            "properties": {
                "products": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "price": {"type": "number"},
                            "currency": {"type": "string"},
                            "in_stock": {"type": "boolean"},
                            "rating": {"type": "number"}
                        },
                        "required": ["name", "price"]
                    }
                }
            }
        }
        result = await tabs.extract.json(
            url='https://shop.example.com/category/laptops',
            schema=schema
        )
        # Filter and sort in-stock products by rating
        products = result.data['products']
        in_stock = [p for p in products if p.get('in_stock', False)]
        top_rated = sorted(in_stock, key=lambda x: x.get('rating', 0), reverse=True)[:5]
        print('Top 5 In-Stock Products:')
        for i, product in enumerate(top_rated, 1):
            print(f"{i}. {product['name']} - {product.get('currency', '$')}{product['price']}")
            print(f"   Rating: {product.get('rating', 'N/A')} stars")

asyncio.run(extract_products())
Example 3: Concurrent Multi-Page Extraction
async def extract_multiple_pages():
    urls = [
        'https://example.com/products/page-1',
        'https://example.com/products/page-2',
        'https://example.com/products/page-3'
    ]
    schema = {
        "type": "object",
        "properties": {
            "products": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "price": {"type": "number"}
                    }
                }
            }
        }
    }
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        # Extract from all pages concurrently
        tasks = [tabs.extract.json(url, schema) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        all_products = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                print(f"Page {i+1} failed: {result}")
            else:
                all_products.extend(result.data['products'])
        print(f"Extracted {len(all_products)} total products")

asyncio.run(extract_multiple_pages())
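asyncio.gather starts every request at once, which may be more concurrency than a site or your plan tolerates. To cap in-flight requests, wrap each call in an asyncio.Semaphore; a sketch where the limit of 5 is an arbitrary choice:

async def extract_with_limit(urls, schema, limit=5):
    semaphore = asyncio.Semaphore(limit)
    async with TABStack(api_key=os.getenv('TABSTACK_API_KEY')) as tabs:
        async def bounded_extract(url):
            # At most `limit` extractions run at any moment
            async with semaphore:
                return await tabs.extract.json(url, schema)
        tasks = [bounded_extract(url) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)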
Options Reference
extract.markdown()
| Parameter | Type | Default | Description |
|---|---|---|---|
| url | str | required | URL to convert to markdown |
| metadata | bool | False | Include page metadata in response |
| nocache | bool | False | Bypass cache and force fresh fetch |
extract.schema()
| Parameter | Type | Default | Description |
|---|---|---|---|
| url | str | required | URL to analyze for schema generation |
| instructions | str | required | Natural language description of data to extract |
| nocache | bool | False | Bypass cache and force fresh fetch |
extract.json()
| Parameter | Type | Default | Description |
|---|---|---|---|
| url | str | required | URL to extract data from |
| schema | dict | required | JSON Schema defining the structure |
| nocache | bool | False | Bypass cache and force fresh fetch |
Best Practices
1. Generate Schemas First
# ✅ Good: Generate schema automatically
schema_result = await tabs.extract.schema(
    url=url,
    instructions='extract product details'
)
schema = schema_result.schema

# ❌ Tedious: Writing complex schemas by hand
schema = {...}  # hundreds of lines of hand-written JSON Schema
2. Use Async Context Manager
# ✅ Good: Automatic cleanup
async with TABStack(api_key=api_key) as tabs:
    result = await tabs.extract.markdown(url)

# ❌ Not recommended: Manual cleanup
tabs = TABStack(api_key=api_key)
try:
    result = await tabs.extract.markdown(url)
finally:
    await tabs.close()
3. Handle Errors Properly
# ✅ Good: Error handling
try:
    result = await tabs.extract.json(url, schema)
except Exception as error:
    print(f"Extraction failed: {error}")
    return None
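For transient failures such as timeouts or rate limiting, a retry with exponential backoff often helps. A minimal sketch; which exceptions are safe to retry depends on the SDK's error types, so catching Exception here is a deliberately broad assumption:

async def extract_with_retry(tabs, url, schema, attempts=3):
    for attempt in range(attempts):
        try:
            return await tabs.extract.json(url, schema)
        except Exception as error:
            if attempt == attempts - 1:
                raise
            # Back off 1s, 2s, 4s, ... between attempts
            await asyncio.sleep(2 ** attempt)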
4. Use Connection Pooling for Multiple Requests
# ✅ Good: Concurrent requests with connection pooling
async with TABStack(
    api_key=api_key,
    max_connections=50
) as tabs:
    tasks = [tabs.extract.markdown(url) for url in urls]
    results = await asyncio.gather(*tasks)
Next Steps
- Generate Features: Transform and analyze extracted data with AI
- Automate Features: Execute complex browser automation tasks
- Error Handling: Build robust applications with proper error handling
- REST API Reference: See the underlying REST API endpoints