Depth-Controlled Media Crawler with Cordova & Fetch API
Depth-Controlled Media Crawler with Cordova & Fetch API
Couldn't load pickup availability
Boost your web crawling capabilities with our enhanced JavaScript crawler designed for Cordova applications. This tool features depth-controlled crawling by queuing each URL with a specified recursion depth, ensuring your crawl doesn't go too deep. With built-in progress tracking and integrated media downloading, you can fetch video and audio files seamlessly using the modern fetch API and Cordova File plugin.
Key Features:
- Depth-controlled crawling with a customizable maximum depth (default set to 3)
- Sequential, depth-bounded crawl that processes one page at a time in queue order
- Real-time display of pages crawled
- Media download functionality supporting video and audio extraction
- Error handling and progress updates using a simple on-page info element
The included code snippet demonstrates the integration of a depth-tracking crawl queue, media downloading using the Cordova File plugin, and dynamic updating of crawl progress. Whether you’re a developer or a marketer looking for robust web crawling functionality, this solution is well-equipped to handle the task.
"use strict";
// Defer all work until the "deviceready" event fires on the document;
// onDeviceReady (defined below) then updates the status text and starts the crawl.
document.addEventListener("deviceready", onDeviceReady, false);
/**
 * "deviceready" handler: announces readiness in the element with id "status",
 * then launches a crawl that both starts at and is scoped to the site root,
 * with a depth limit of 3.
 */
function onDeviceReady() {
  const statusElement = document.getElementById("status");
  statusElement.textContent = "Device Ready – starting crawl…";

  const siteRoot = "https://example.com";
  const depthLimit = 3;
  // The site root serves as both the scope prefix and the first page to fetch.
  startCrawling(siteRoot, siteRoot, depthLimit);
}
/**
 * Crawl pages under `baseUrl` breadth-first, starting from `startUrl`.
 *
 * Pages are fetched one at a time in queue order. Links found on a page at
 * depth d are queued at depth d + 1, and links are only followed from pages
 * whose depth is below `maxDepth`. `updateCrawlInfo(count)` is called before
 * each fetch, and the element with id "status" receives a summary once the
 * queue is empty. A page that fails to load is logged and skipped.
 *
 * @param {string} baseUrl - Prefix a link must start with to be followed; when
 *     it is an absolute URL, links must also share its origin.
 * @param {string} startUrl - First page to fetch (depth 0).
 * @param {number} maxDepth - Links are not followed from pages at or beyond
 *     this depth.
 * @returns {Promise<void>} Settles when the crawl has finished. Existing
 *     callers that ignore the return value keep working.
 */
function startCrawling(baseUrl, startUrl, maxDepth) {
  const pageQueue = [{ url: startUrl, depth: 0 }];
  // Every URL already queued or fetched, so each page is enqueued at most once.
  const seenUrls = new Set();
  let pagesCrawledCount = 0;

  // A bare prefix test would accept "https://example.com.evil.com" for the
  // base "https://example.com", so the origin is compared as well.
  let baseOrigin = null;
  try {
    baseOrigin = new URL(baseUrl).origin;
  } catch (err) {
    baseOrigin = null; // baseUrl is not absolute: fall back to the prefix test only.
  }

  // Resolve rawUrl (optionally against referrerUrl) and drop any #fragment so
  // that "/page#a" and "/page#b" count as the same page. Returns null when the
  // value is missing or cannot be parsed.
  function normalizeUrl(rawUrl, referrerUrl) {
    if (!rawUrl) {
      return null;
    }
    try {
      const resolved = new URL(rawUrl, referrerUrl);
      resolved.hash = "";
      return resolved;
    } catch (err) {
      return null;
    }
  }

  function isInScope(target) {
    return (
      target.href.startsWith(baseUrl) &&
      (baseOrigin === null || target.origin === baseOrigin)
    );
  }

  // Fetch one page and, if the depth limit allows, queue its in-scope links.
  async function crawlPage(pageUrl, depth) {
    const response = await fetch(pageUrl);
    if (!response.ok) {
      // Do not harvest links from 4xx/5xx error pages.
      throw new Error("HTTP " + response.status);
    }
    const htmlString = await response.text();
    const doc = new DOMParser().parseFromString(htmlString, "text/html");

    if (depth < maxDepth) {
      // A DOMParser document inherits the app's own URL as its base, so
      // `a.href` would resolve relative links against the app instead of the
      // crawled page. Resolve the raw attribute against the page URL instead
      // (the final URL after redirects when the response exposes one).
      const pageBase = response.url || pageUrl;
      doc.querySelectorAll("a").forEach(function (a) {
        const target = normalizeUrl(a.getAttribute("href"), pageBase);
        if (target === null || !isInScope(target) || seenUrls.has(target.href)) {
          return;
        }
        seenUrls.add(target.href);
        pageQueue.push({ url: target.href, depth: depth + 1 });
      });
    }
    // Process media downloading ...
  }

  // Drain the queue with a loop rather than recursion so a long crawl cannot
  // grow the call stack.
  async function run() {
    while (pageQueue.length > 0) {
      const page = pageQueue.shift();
      pagesCrawledCount++;
      updateCrawlInfo(pagesCrawledCount);
      try {
        await crawlPage(page.url, page.depth);
      } catch (err) {
        // Best effort: one failing page must not stop the whole crawl.
        console.error("Error crawling " + page.url + ":", err);
      }
    }
    document.getElementById("status").textContent =
      "Crawl complete. Pages crawled: " + pagesCrawledCount;
  }

  // Register the start page under its normalized form so that a later link to
  // e.g. "https://example.com/" is not treated as a new page.
  const startKey = normalizeUrl(startUrl);
  seenUrls.add(startKey === null ? startUrl : startKey.href);

  return run();
}
Share
