Kobo-Crawler/dothething.js at main · samlam369/Kobo-Crawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import { By, Builder } from "selenium-webdriver";
import chrome from 'selenium-webdriver/chrome.js';

/**
 * Collect the post URLs of every blog card that advertises the weekly sale.
 *
 * Cards are inspected one at a time, in the order given. A card qualifies when
 * its visible text contains the marker `【一週99書單】`; the `href` of its
 * `a.card__link` descendant is then collected.
 *
 * @param {WebElement[]} cards - elements matching the CSS selector `.card`
 * @returns {Promise<string[]>} - hrefs of the qualifying cards, in input order
 * @throws {Error} when none of the cards contains the marker text
 */
async function extractWeeklySalesLinks(cards) {
    const marker = '【一週99書單】';
    const matches = [];

    for (const candidate of cards) {
        const visibleText = await candidate.getText();
        // Skip cards that are not weekly-sale posts.
        if (!visibleText.includes(marker)) {
            continue;
        }
        const anchor = await candidate.findElement(By.css('a.card__link'));
        const href = await anchor.getAttribute('href');
        matches.push(href);
    }

    // An empty result means the scrape cannot continue, so fail loudly.
    if (!matches.length) {
        throw new Error('No weekly sales links found. Possible reasons: 1. The page structure has changed; 2. The target text "【一週99書單】" has changed; 3. The latest weekly post is not on page 1 anymore.');
    }
    return matches;
}


/**
 * Format a date string in the format "M/D" into a string in the format "YYYY-MM-DD".
 *
 * The input carries no year, so the year is chosen so that the resulting date
 * is the one closest to `referenceDate`. This keeps dates correct for a week
 * that straddles New Year (e.g. "1/2" seen on Dec 30 belongs to the next year).
 * Surrounding whitespace and whitespace around the slash are ignored.
 *
 * @param {string} dateStr - date string in the format "M/D" (or "MM/DD")
 * @param {Date} [referenceDate=new Date()] - point in time used to pick the year
 * @returns {string} - formatted date string in the format "YYYY-MM-DD"
 * @throws {Error} when `dateStr` is not a valid "M/D" calendar date
 */
function formatDate(dateStr, referenceDate = new Date()) {
    const match = /^(\d{1,2})\s*\/\s*(\d{1,2})$/.exec(String(dateStr).trim());
    if (!match) {
        throw new Error(`Invalid date string "${dateStr}". Expected the format "M/D".`);
    }
    const month = Number(match[1]);
    const day = Number(match[2]);

    // Round-trip through Date to reject impossible dates such as "13/1" or "2/30".
    // 2024 is a leap year, so "2/29" is accepted. Date months are 0-indexed.
    const probe = new Date(2024, month - 1, day);
    if (probe.getMonth() !== month - 1 || probe.getDate() !== day) {
        throw new Error(`Invalid date string "${dateStr}". Expected the format "M/D".`);
    }

    // Pick the year (previous, current or next) that puts the date nearest to
    // the reference date; the current year wins ties because it is tried first.
    const referenceYear = referenceDate.getFullYear();
    const referenceTime = referenceDate.getTime();
    let bestYear = referenceYear;
    let bestDistance = Infinity;
    for (const year of [referenceYear, referenceYear - 1, referenceYear + 1]) {
        const distance = Math.abs(new Date(year, month - 1, day).getTime() - referenceTime);
        if (distance < bestDistance) {
            bestYear = year;
            bestDistance = distance;
        }
    }

    const paddedMonth = String(month).padStart(2, '0');
    const paddedDay = String(day).padStart(2, '0');
    return `${bestYear}-${paddedMonth}-${paddedDay}`;
}

/**
 * Extract daily deals from the given content blocks.
 *
 * The blocks are consumed as consecutive pairs: the first block of a pair
 * supplies the `h3` title line, the `a` link/title and the `p` sales copy; the
 * second supplies the `.author` text and the `img` cover. Only the first
 * `dealCount` pairs are read; any further blocks are ignored.
 *
 * @param {WebElement[]} blocks - post blocks with the header already removed
 *   (the caller selects them with `.content-block, .book-block`)
 * @param {number} [dealCount=8] - number of deals (block pairs) to extract
 * @returns {Promise<Object[]>} - array of daily deals, each as an object with properties:
 *   - date: {string} - date string in the format "YYYY-MM-DD"
 *   - title: {string} - title of the book
 *   - author: {string} - author of the book
 *   - salesCopy: {string} - sales copy of the book
 *   - link: {string} - link to the book page (query string and fragment removed)
 *   - bookCover: {string} - link to the book cover image
 * @throws {RangeError} when `dealCount` is not a non-negative integer
 * @throws {Error} when fewer than `dealCount * 2` blocks are supplied
 */
async function extractDailyDeals(blocks, dealCount = 8) {
    if (!Number.isInteger(dealCount) || dealCount < 0) {
        throw new RangeError(`dealCount must be a non-negative integer, received ${dealCount}.`);
    }
    // Fail early with a clear message instead of a TypeError on a missing block.
    const requiredBlocks = dealCount * 2;
    const available = Array.isArray(blocks) ? blocks.length : 0;
    if (available < requiredBlocks) {
        throw new Error(`Cannot extract ${dealCount} daily deals: expected at least ${requiredBlocks} blocks but received ${available}.`);
    }

    const dailyDeals = [];
    for (let i = 0; i < requiredBlocks; i += 2) {
        const infoBlock = blocks[i];
        const coverBlock = blocks[i + 1];

        // The heading starts with the "M/D" date, followed by "週" and the rest.
        const titleLine = await infoBlock.findElement(By.css('h3')).getText();
        const date = formatDate(titleLine.split('週').shift());

        // Look the anchor up once; it provides both the title and the link.
        const anchor = await infoBlock.findElement(By.css('a'));
        const title = await anchor.getText();

        // Drops the first two and last two characters of the author text —
        // presumably a fixed label around the name; verify against the page.
        const rawAuthor = await coverBlock.findElement(By.css('.author')).getText();
        const author = rawAuthor.slice(2, -2);

        const salesCopy = await infoBlock.findElement(By.css('p')).getText();

        // Keep only origin + path so query parameters and fragments are removed.
        const href = await anchor.getAttribute('href');
        const parsedUrl = new URL(href);
        const link = parsedUrl.origin + parsedUrl.pathname;

        const bookCover = await coverBlock.findElement(By.css('img')).getAttribute('src');

        dailyDeals.push({ date, title, author, salesCopy, link, bookCover });
    }
    return dailyDeals;
}

/**
 * Extract the ISBN (Book ID) from the metadata.
 *
 * The metadata items are read in order; the first one whose text contains the
 * label '書籍ID' is used. The value is whatever follows the last colon in that
 * text, trimmed. Both the full-width colon ('：') and the ASCII colon (':') are
 * accepted as the separator, so the label never ends up in the result.
 *
 * @param {WebElement[]} metadata - elements containing book metadata
 * @returns {Promise<string>} - extracted ISBN as a string, or an empty string if not found
 */
async function extractISBN(metadata) {
    for (const line of metadata) {
        const text = await line.getText();
        if (!text.includes('書籍ID')) {
            continue;
        }
        return text.split(/[：:]/).pop().trim();
    }
    // If ISBN not found after checking all lines, return empty string.
    // Errors during getText() or other operations will propagate up.
    return "";
}

/**
 * Script entry point.
 *
 * Opens the Kobo blog in Chrome, follows the first weekly-sale post found on
 * the page, extracts the daily deals from it, visits each book page to add the
 * Book ID, and prints the result as JSON. Any failure is logged and ends the
 * process with exit code 1 after an attempt to quit the browser.
 *
 * Set the environment variable LOAD_IMAGES=false to disable image loading.
 */
(async () => {
    console.log("Starting the job");
    // extractDailyDeals reads 8 pairs of blocks (indices 0..15) AFTER the header
    // block has been removed, so the page must provide 1 + 8 * 2 = 17 blocks.
    const DEALS_PER_POST = 8;
    const MIN_BLOCKS = 1 + DEALS_PER_POST * 2;
    let driver;
    try {
        // Determine whether to load images based on environment variable
        const loadImages = process.env.LOAD_IMAGES !== 'false'; // Default to true if not set or not 'false'
        const chromeOptions = new chrome.Options();
        if (!loadImages) {
            chromeOptions.addArguments('--blink-settings=imagesEnabled=false');
            chromeOptions.addArguments('--disable-images');
            console.log("Image loading disabled.");
        } else {
            console.log("Image loading enabled.");
        }
        driver = await new Builder()
            .forBrowser('chrome')
            .setChromeOptions(chromeOptions)
            .build();
        console.log("Navigating to the Kobo blog page");
        await driver.get('https://www.kobo.com/zh/blog');
        const cards = await driver.findElements(By.css('.card'));
        if (!cards || cards.length === 0) {
            throw new Error('No cards found on the blog page.');
        }
        console.log("Extracting blog posts on the page");
        const weeklySalesLinks = await extractWeeklySalesLinks(cards);
        console.log("Navigating to the latest Weekly Sales link");
        // The first matching card on the page is treated as the latest post.
        await driver.get(weeklySalesLinks.shift());
        console.log("Latest Weekly Sales:", await driver.getTitle());

        const blocks = await driver.findElements(By.css('.content-block, .book-block'));
        if (!blocks || blocks.length < MIN_BLOCKS) {
            // Specific check for expected content structure
            const found = blocks ? blocks.length : 0;
            throw new Error(`Error collecting post content. Expected at least ${MIN_BLOCKS} blocks. Found ${found}. Please check the blog page structure.`);
        }
        blocks.shift(); // Remove the first element because it's a header
        const dailyDeals = await extractDailyDeals(blocks);

        // Amending dailyDeals with the Kobo Book ID
        for (const deal of dailyDeals) {
            console.log(`Navigating to the individual book page to retrieve Book ID: ${deal.title}`);
            await driver.get(deal.link);
            const metadata = await driver.findElements(By.css('.bookitem-secondary-metadata > ul > li'));
            const isbn = await extractISBN(metadata);
            if (!isbn) {
                // Throw an error if ISBN is not found, as it's considered fatal.
                throw new Error(`Fatal: ISBN not found for book: ${deal.title} (${deal.link})`);
            }
            deal.isbn = isbn;
        }
        console.log("Daily Deals:");
        console.log(JSON.stringify(dailyDeals, null, 2));

    } catch (err) {
        // Centralized error handling
        console.error('An error occurred during script execution:', err);
        // Quit the driver here because process.exit() below terminates the
        // process immediately, so the finally block does not run on this path.
        if (driver) {
            try {
                console.log("Attempting to quit the web driver after error...");
                await driver.quit();
            } catch (quitErr) {
                console.error('Error quitting driver after script error:', quitErr);
            }
        }
        process.exit(1); // Exit with error code
    } finally {
        // Reached only on the success path (the catch block exits the process):
        // release the browser, and treat a failed quit as a job failure.
        if (driver) {
            try {
                console.log("Ensuring web driver is quit in finally block...");
                await driver.quit();
            } catch (quitErr) {
                console.error('Error quitting driver in final finally block:', quitErr);
                // Exit even if quit fails in finally, as an error already occurred or script finished.
                if (process.exitCode !== 1) { // Avoid double exit if already exiting due to error
                   process.exit(1);
                }
            }
        }
    }
    console.log("Job finished successfully."); // Log success if no errors caused an exit
})();
// There is no outer try-catch: the IIFE's own try/catch also covers driver-initialization errors.
// The async IIFE is kept as the script's entry point.