Access the data from your crawled website
# Directory on disk where the crawl's data is stored.
crawl.output_dir
# './webtranspose-output'
# get visited_urls visited_urls = crawl.get_visited() visited_urls # ['https://www.example.com', 'https://www.example.com/about'] for x in visited_urls: page_details = crawl.get_page(x)
{
    "url": "https://www.example.com",
    "html": <RAW HTML>,
    "text": <TEXT FROM HTML OR PDF>,
    "child_urls": [
        "https://www.example.com/about",
        "https://www.example.com/contact"
    ],
    "parent_urls": [
        "https://www.example.com",
        "https://www.example.com/child-page-1"
    ]
}
crawl.download()
import webtranspose as webt url = "https://www.webtranspose.com" crawl = Crawl( url, max_pages=5, ) child_urls = crawl.get_page_child_urls(url)
# URLs the crawler has already visited.
visited_urls = crawl.get_visited()
visited_urls
# ['https://www.example.com', 'https://www.example.com/about']

# URLs the crawler skipped.
ignored_urls = crawl.get_ignored()
ignored_urls
# ['https://www.example.com', 'https://www.example.com/about']

# URLs still waiting to be crawled.
queued_urls = crawl.get_queued()
queued_urls
# ['https://www.example.com', 'https://www.example.com/about']

# URLs the crawler is not allowed to visit.
banned_urls = crawl.get_banned()
banned_urls
# ['https://www.example.com', 'https://www.example.com/about']