WebSweep
Installation
Requirements
Install from PyPI
Install from Source (Developers)
User Guide
Quickstart
What each component does
Library Quickstart and Workflow
Common library options (most used)
CLI Workflow (Detailed)
Backend setup (done during `init`)
How CLI configuration works
CLI commands and common options
How `target_temp_folder_path` works
Extractor date windows (how dates are used)
Recurring CLI pattern (every X months)
Custom Extraction Add-ons
URL Filtering Rules
Troubleshooting Statuses
Examples
CLI Examples
Featured Notebook (Parsed)
WebSweep Example for Researchers
websweep
websweep package
Subpackages
Submodules
websweep.config module
websweep.main module
Module contents
Contribute
How to Contribute
Developing WebSweep
Contact and Support
Contact Us
WebSweep
Index
Edit on GitHub
Index
_
|
A
|
B
|
C
|
D
|
E
|
F
|
G
|
I
|
J
|
K
|
M
|
O
|
P
|
R
|
S
|
T
|
W
|
Z
_
_create_results() (websweep.extractor.extractor.Extractor method)
A
address (websweep.consolidator.consolidator.Domain attribute)
,
[1]
append_jsonl() (in module websweep.utils.json_io)
B
btw (websweep.consolidator.consolidator.Domain attribute)
,
[1]
build_tldextract_extractor() (in module websweep.utils.public_suffix)
C
classify_url() (in module websweep.utils.utils)
clean_url() (in module websweep.utils.utils)
cli_config() (in module websweep.main)
consolidate() (in module websweep.main)
consolidate() (websweep.consolidator.consolidator.Consolidator method)
Consolidator (class in websweep.consolidator.consolidator)
crawl() (in module websweep.main)
crawl_base_urls() (websweep.crawler.crawler.Crawler method)
crawl_complement_base_urls() (websweep.crawler.crawler.Crawler method)
Crawler (class in websweep.crawler.crawler)
create_domain_info() (websweep.consolidator.consolidator.Consolidator method)
create_regex_pattern() (in module websweep.utils.utils)
current_websweep_instance() (in module websweep.config)
D
detect_existing_overview_backend() (in module websweep.utils.backend)
Domain (class in websweep.consolidator.consolidator)
domain (websweep.consolidator.consolidator.Domain attribute)
,
[1]
duckdb_available() (in module websweep.utils.backend)
E
email (websweep.consolidator.consolidator.Domain attribute)
,
[1]
ensure_public_suffix_list() (in module websweep.utils.public_suffix)
extract() (in module websweep.main)
extract_custom_metadata() (websweep.extractor.extractor.FileExtractor method)
extract_default_metadata() (websweep.extractor.extractor.FileExtractor method)
,
[1]
extract_extended_metadata() (websweep.extractor.extractor.FileExtractor method)
extract_urls() (websweep.extractor.extractor.Extractor method)
,
[1]
extracting() (websweep.extractor.extractor.FileExtractor method)
,
[1]
Extractor (class in websweep.extractor.extractor)
F
fax (websweep.consolidator.consolidator.Domain attribute)
,
[1]
FileExtractor (class in websweep.extractor.extractor)
from_dict() (websweep.consolidator.consolidator.Domain class method)
G
get_extractor_addon_file() (in module websweep.config)
get_extractor_delete() (in module websweep.config)
get_source_file_path() (in module websweep.config)
get_target_folder_path() (in module websweep.config)
get_urls() (websweep.crawler.crawler.Crawler method)
get_use_database() (in module websweep.config)
I
identifier (websweep.consolidator.consolidator.Domain attribute)
,
[1]
init() (in module websweep.main)
init_app() (in module websweep.config)
J
json_dumps() (in module websweep.utils.json_io)
json_loads() (in module websweep.utils.json_io)
K
kvk (websweep.consolidator.consolidator.Domain attribute)
,
[1]
M
main() (in module websweep.main)
merge_domain_files() (websweep.consolidator.consolidator.Consolidator method)
module
websweep
websweep.config
websweep.consolidator
websweep.consolidator.consolidator
websweep.crawler
websweep.crawler.crawler
websweep.extractor
websweep.extractor.add_host
websweep.extractor.extractor
websweep.main
websweep.utils
websweep.utils.backend
websweep.utils.json_io
websweep.utils.public_suffix
websweep.utils.source_urls
websweep.utils.utils
O
operate() (in module websweep.main)
P
phone (websweep.consolidator.consolidator.Domain attribute)
,
[1]
R
read_ndjson_in_chunks() (websweep.consolidator.consolidator.Consolidator method)
read_source_urls() (in module websweep.utils.source_urls)
resolve_overview_backend() (in module websweep.utils.backend)
restore() (in module websweep.main)
restore_app() (in module websweep.config)
S
save_orjson_loads() (websweep.consolidator.consolidator.Consolidator method)
set_regex() (in module websweep.utils.utils)
T
text (websweep.consolidator.consolidator.Domain attribute)
,
[1]
to_dict() (websweep.consolidator.consolidator.Domain method)
W
websweep
module
websweep.config
module
websweep.consolidator
module
websweep.consolidator.consolidator
module
websweep.crawler
module
websweep.crawler.crawler
module
websweep.extractor
module
websweep.extractor.add_host
module
websweep.extractor.extractor
module
websweep.main
module
websweep.utils
module
websweep.utils.backend
module
websweep.utils.json_io
module
websweep.utils.public_suffix
module
websweep.utils.source_urls
module
websweep.utils.utils
module
websweep_address() (in module websweep.main)
Z
zipcode (websweep.consolidator.consolidator.Domain attribute)
,
[1]