Crawl Website Data Using Node.js

Hòa Nguyễn Coder - Apr 12 '21 - Dev Community

Today, I'll walk through a simple example of crawling data from a website using Node.js. I want to call a URL, then download all of the images on that page and save them into a directory.

I'm using the libraries below:

npm install request-promise cheerio request
  • request-promise : makes the HTTP call to each URL and returns a promise
  • cheerio : parses the HTML so we can query DOM elements with jQuery-like selectors (id, class, ...) — see the short sketch after this list
  • request : streams the image downloads to disk
  • fs : Node's built-in file system module, used to read and write files (no install needed)
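To make the cheerio bullet concrete, here is a minimal sketch of its jQuery-like API. The HTML fragment is made up for illustration; the .info_content class mirrors the one queried in the crawler below:

const cheerio = require("cheerio");

// a made-up HTML fragment, just to demonstrate the jQuery-like selectors
const $ = cheerio.load('<p class="info_content"><img src="https://example.com/img/one.jpg"></p>');

// select by class and read an attribute, exactly as you would in jQuery
console.log($(".info_content").find("img").attr("src"));
// -> https://example.com/img/one.jpg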

Okay, once the project is set up, we need to create the following files and folders:

  • create a dslink.txt file in the project directory, holding the list of URLs to crawl (one per line, as in the example after this list)
  • create an images folder in the project directory, used to save the downloaded images
  • create a data.json file, used to save the info crawled from each URL
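For example, dslink.txt might look like this, one URL per line (these URLs are placeholders; list the pages you actually want to crawl):

https://example.com/chapter-1
https://example.com/chapter-2
https://example.com/chapter-3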

Okay, now create an index.js file:

const rp = require("request-promise");
const cheerio = require("cheerio");
const request = require('request');
const fs = require("fs");

// pause execution for the given number of milliseconds
function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}

// read the list of URLs to crawl, one per line (skip blank lines)
const dslink = "dslink.txt";

const arrayLink = fs.readFileSync(dslink)
    .toString()
    .split("\n")
    .map(link => link.trim())
    .filter(link => link.length > 0);

async function crawler() {
    await sleep(1000);
    const data = []; // accumulate results across every link, not just the last one
    for (let i = 0; i < arrayLink.length; i++) {
        const linkchay = arrayLink[i];

        let $;
        try {
            const options = {
                uri: linkchay,
                // hand the body to cheerio so we can query it like jQuery
                transform: function (body) {
                    return cheerio.load(body);
                },
            };
            $ = await rp(options);
        } catch (error) {
            // skip a broken link instead of aborting the whole crawl
            console.log("Broken link: " + linkchay);
            continue;
        }

        const title = $(".ten_title").text().trim();
        const tableContent = $(".info_content");
        // the title of this chapter
        const chapterTitle = tableContent.find("p").text().trim();
        const chapterData = [];
        const chapterLink = tableContent.find("p").find("img");

        for (let j = 0; j < chapterLink.length; j++) {
            const post = $(chapterLink[j]);
            const postLink = post.attr("src"); // assumes src holds an absolute URL

            // take everything after the last "/" as the file name
            const n = postLink.lastIndexOf("/");
            const filename = postLink.substring(n + 1);

            download(postLink, filename, function () {
                //console.log("Link:" + linkchay);
            });
            const postTitle = post.text().trim();
            chapterData.push({
                postTitle,
                linkchay,
                filename,
            });
        }
        data.push({
            chapterTitle,
            chapterData,
        });

        // rewrite data.json with everything crawled so far
        fs.writeFileSync('data.json', JSON.stringify(data));
        console.log(linkchay + "------------->done");
        await sleep(1000);
    }
}
//call crawler
crawler();

// download a file by streaming it into the images folder
var download = function (uri, filename, callback) {
    request.head(uri, function (err, res, body) {
        if (err) {
            // res is undefined on error, so bail out before touching headers
            console.log("Download failed: " + uri);
            return;
        }
        console.log('content-type:', res.headers['content-type']);
        console.log('content-length:', res.headers['content-length']);

        request(uri).pipe(fs.createWriteStream('./images/' + filename)).on('close', callback);
    });
};
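With everything in place, run the script from the project directory:

node index.js

Assuming the target pages contain the .info_content markup queried above, each link prints a done marker to the console, the images land in ./images, and data.json ends up with entries shaped roughly like this (the values here are illustrative):

[
  {
    "chapterTitle": "Chapter 1",
    "chapterData": [
      {
        "postTitle": "",
        "linkchay": "https://example.com/chapter-1",
        "filename": "one.jpg"
      }
    ]
  }
]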

