205 changes: 205 additions & 0 deletions scrapegraph-js/examples/crawl/crawl_with_path_filtering_example.js
@@ -0,0 +1,205 @@
/**
* Example of using the crawl endpoint with path filtering.
*
* This example demonstrates how to use includePaths and excludePaths
* to control which pages are crawled on a website.
*/

import { crawl, getCrawlRequest } from 'scrapegraph-js';
import dotenv from 'dotenv';

dotenv.config();

const SGAI_API_KEY = process.env.SGAI_APIKEY || process.env.SGAI_API_KEY;

// Define your output schema
const productSchema = {
type: 'object',
properties: {
products: {
type: 'array',
items: {
type: 'object',
properties: {
name: { type: 'string', description: 'Product name' },
price: { type: 'string', description: 'Product price' },
description: { type: 'string', description: 'Product description' },
category: { type: 'string', description: 'Product category' }
},
required: ['name', 'price']
}
},
total_products: {
type: 'number',
description: 'Total number of products found'
}
},
required: ['products', 'total_products']
};

// Helper function to wait for crawl completion
async function waitForCrawl(taskId, maxAttempts = 60, delay = 5000) {
for (let attempt = 0; attempt < maxAttempts; attempt++) {
await new Promise(resolve => setTimeout(resolve, delay));

const status = await getCrawlRequest(SGAI_API_KEY, taskId);
const state = status.state || 'UNKNOWN';

console.log(`Attempt ${attempt + 1}: Status = ${state}`);

if (state === 'SUCCESS') {
console.log('\n✨ Crawl completed successfully!');
return status.result;
} else if (state === 'FAILURE' || state === 'REVOKED') {
console.log(`\n❌ Crawl failed with status: ${state}`);
throw new Error(`Crawl failed: ${state}`);
}
}

throw new Error('Timeout: Crawl took too long');
}

async function example1() {
console.log('\n📝 Example 1: Crawl only /products/* pages');
console.log('-'.repeat(50));

const result = await crawl(
SGAI_API_KEY,
'https://example.com',
'Extract product information including name, price, and description',
productSchema,
{
depth: 2,
maxPages: 10,
includePaths: ['/products/*', '/items/*'], // Only crawl product pages
excludePaths: ['/products/archived/*'] // But skip archived products
}
);

console.log(`Task ID: ${result.task_id}`);
console.log('\n✅ Crawl job started successfully!');

return result.task_id;
}

async function example2() {
console.log('\n📝 Example 2: Crawl all pages except admin and API');
console.log('-'.repeat(50));

const result = await crawl(
SGAI_API_KEY,
'https://example.com',
'Extract all relevant information from the website',
productSchema,
{
depth: 2,
maxPages: 20,
excludePaths: [
'/admin/*', // Skip all admin pages
'/api/*', // Skip all API endpoints
'/private/*', // Skip private pages
'/*.json' // Skip JSON files
]
}
);

console.log(`Task ID: ${result.task_id}`);
console.log('\n✅ Crawl job started successfully!');

return result.task_id;
}

async function example3() {
console.log('\n📝 Example 3: Complex path filtering with wildcards');
console.log('-'.repeat(50));

const blogSchema = {
type: 'object',
properties: {
posts: {
type: 'array',
items: {
type: 'object',
properties: {
title: { type: 'string' },
author: { type: 'string' },
date: { type: 'string' },
content: { type: 'string' }
}
}
}
}
};

const result = await crawl(
SGAI_API_KEY,
'https://example.com',
'Extract blog posts with title, author, date, and content',
blogSchema,
{
depth: 3,
maxPages: 15,
sitemap: true, // Use sitemap for better coverage
includePaths: [
'/blog/**', // Include all blog pages (any depth)
'/articles/*', // Include top-level articles
'/news/2024/*' // Include 2024 news only
],
excludePaths: [
'/blog/draft/*', // Skip draft blog posts
'/blog/*/comments' // Skip comment pages
]
}
);

console.log(`Task ID: ${result.task_id}`);
console.log('\n✅ Crawl job started successfully!');

return result.task_id;
}

async function main() {
try {
console.log('🔍 Starting crawl with path filtering...');
console.log('='.repeat(50));

    // Run examples 1 and 2 (their task IDs are logged inside each function)
    await example1();
    await example2();

    // Run example 3 and keep its task ID
    const taskId3 = await example3();

// Optionally wait for one of the crawls to complete
console.log(`\n⏳ Waiting for example 3 to complete (task: ${taskId3})...`);
const result = await waitForCrawl(taskId3);

console.log('\n📊 Crawl Results:');
console.log(JSON.stringify(result, null, 2));

// Print guide
console.log('\n' + '='.repeat(50));
console.log('📚 Path Filtering Guide:');
console.log('='.repeat(50));
console.log('• Use \'/*\' to match a single path segment');
console.log(' Example: \'/products/*\' matches \'/products/item1\' but not \'/products/cat/item1\'');
console.log('\n• Use \'/**\' to match any number of path segments');
console.log(' Example: \'/blog/**\' matches \'/blog/2024/post\' and \'/blog/category/2024/post\'');
console.log('\n• excludePaths takes precedence over includePaths');
console.log(' You can include a broad pattern and exclude specific subsets');
console.log('\n• Paths must start with \'/\'');
console.log(' Example: \'/products/*\' is valid, \'products/*\' is not');
console.log('\n💡 Tips:');
console.log('• Combine with sitemap: true for better page discovery');
console.log('• Use includePaths to focus on content-rich sections');
console.log('• Use excludePaths to skip duplicate or irrelevant content');

} catch (error) {
console.error('❌ Error:', error.message);
process.exit(1);
}
}

main();
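The single-segment vs. multi-segment wildcard rules printed by the guide above can be illustrated with a small local helper. This is a sketch only, assuming glob-like semantics as described in the guide; it is not part of scrapegraph-js, and the actual path filtering is performed server-side by the crawl service.

// Illustrative only — a hypothetical approximation of the documented wildcard
// semantics. Not part of the library; shown purely to clarify which URLs the
// patterns in the guide are expected to match.
function matchesPath(pattern, path) {
  const escaped = pattern.replace(/[.+?^${}()|[\]\\]/g, '\\$&'); // escape regex metacharacters
  const source = escaped
    .split('**')                                // '**' -> any number of path segments
    .map(part => part.replace(/\*/g, '[^/]*'))  // '*'  -> a single path segment
    .join('.*');
  return new RegExp(`^${source}$`).test(path);
}

// matchesPath('/products/*', '/products/item1')       -> true
// matchesPath('/products/*', '/products/cat/item1')   -> false
// matchesPath('/blog/**', '/blog/category/2024/post') -> true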
12 changes: 11 additions & 1 deletion scrapegraph-js/src/crawl.js
@@ -23,6 +23,8 @@ import { getMockResponse } from './utils/mockResponse.js';
* @param {boolean} [options.mock] - Override mock mode for this request
* @param {boolean} [options.renderHeavyJs=false] - Whether to render heavy JavaScript on the page
* @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
* @param {Array<string>} [options.includePaths] - List of path patterns to include (e.g., ['/products/*', '/blog/**']). Supports wildcards: * matches a single path segment, ** matches any number of segments
* @param {Array<string>} [options.excludePaths] - List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). Supports wildcards and takes precedence over includePaths
* @returns {Promise<Object>} The crawl job response
* @throws {Error} Throws an error if the HTTP request fails
*/
@@ -33,7 +35,7 @@ export async function crawl(
schema,
options = {}
) {
const { mock = null, renderHeavyJs = false, stealth = false } = options;
const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null } = options;

// Check if mock mode is enabled
const useMock = mock !== null ? mock : isMockEnabled();
@@ -88,6 +90,14 @@
payload.stealth = stealth;
}

if (includePaths) {
payload.include_paths = includePaths;
}

if (excludePaths) {
payload.exclude_paths = excludePaths;
}

try {
const response = await axios.post(endpoint, payload, { headers });
return response.data;
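A minimal call sketch for the new options (illustrative, not part of the PR; it assumes a valid SGAI_API_KEY and reuses the example.com target from the examples above). The camelCase options are forwarded to the API as the snake_case payload keys shown in the diff, include_paths and exclude_paths, with excludePaths taking precedence.

// Minimal usage sketch of the new options (illustrative only).
import { crawl } from 'scrapegraph-js';

const response = await crawl(
  process.env.SGAI_API_KEY,
  'https://example.com',
  'Extract product names and prices',
  {
    type: 'object',
    properties: {
      products: { type: 'array', items: { type: 'string' } }
    }
  },
  {
    depth: 2,
    maxPages: 10,
    includePaths: ['/products/*'],          // sent as include_paths
    excludePaths: ['/products/archived/*']  // sent as exclude_paths
  }
);

console.log(`Crawl job started, task ID: ${response.task_id}`);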
@@ -0,0 +1,98 @@
"""
Example of using the async crawl endpoint with path filtering.

This example demonstrates how to use include_paths and exclude_paths
to control which pages are crawled on a website (async version).
"""
import asyncio
import os
from scrapegraph_py import AsyncClient
from pydantic import BaseModel, Field


# Define your output schema
class ProductInfo(BaseModel):
name: str = Field(description="Product name")
price: str = Field(description="Product price")
category: str = Field(description="Product category")


class CrawlResult(BaseModel):
products: list[ProductInfo] = Field(description="List of products found")
categories: list[str] = Field(description="List of product categories")


async def main():
# Initialize the async client
sgai_api_key = os.getenv("SGAI_API_KEY")

async with AsyncClient(api_key=sgai_api_key) as client:
print("🔍 Starting async crawl with path filtering...")
print("=" * 50)

# Example: Crawl only product pages, excluding certain sections
print("\n📝 Crawling e-commerce site with smart path filtering")
print("-" * 50)

result = await client.crawl(
url="https://example-shop.com",
prompt="Extract all products with their names, prices, and categories",
data_schema=CrawlResult.model_json_schema(),
extraction_mode=True,
depth=3,
max_pages=50,
sitemap=True, # Use sitemap for better coverage
include_paths=[
"/products/**", # Include all product pages
"/categories/*", # Include category listings
"/collections/*" # Include collection pages
],
exclude_paths=[
"/products/out-of-stock/*", # Skip out-of-stock items
"/products/*/reviews", # Skip review pages
"/admin/**", # Skip admin pages
"/api/**", # Skip API endpoints
"/*.pdf" # Skip PDF files
]
)

print(f"Task ID: {result.get('task_id')}")
print("\n✅ Async crawl job started successfully!")

# You can then poll for results using get_crawl
task_id = result.get('task_id')
if task_id:
print(f"\n⏳ Polling for results (task: {task_id})...")

# Poll every 5 seconds until complete
max_attempts = 60 # 5 minutes max
for attempt in range(max_attempts):
await asyncio.sleep(5)
status = await client.get_crawl(task_id)

state = status.get('state', 'UNKNOWN')
print(f"Attempt {attempt + 1}: Status = {state}")

if state == 'SUCCESS':
print("\n✨ Crawl completed successfully!")
result_data = status.get('result', {})
print(f"Found {len(result_data.get('products', []))} products")
break
elif state in ['FAILURE', 'REVOKED']:
print(f"\n❌ Crawl failed with status: {state}")
break
else:
print("\n⏰ Timeout: Crawl took too long")

print("\n" + "=" * 50)
print("💡 Tips for effective path filtering:")
print("=" * 50)
print("• Combine with sitemap=True for better page discovery")
print("• Use include_paths to focus on content-rich sections")
print("• Use exclude_paths to skip pages with duplicate content")
print("• Test your patterns on a small max_pages first")
print("• Remember: exclude_paths overrides include_paths")


if __name__ == "__main__":
asyncio.run(main())