cloudflare:hackerNewsHome

Extract top 30 links from Hacker News homepage

Published August 5, 2025
Source Code
// TASK: hackerNewsHome
// Run this task with:
// forge task:run cloudflare:hackerNewsHome

import axios from 'axios';
import { createTask } from '@forgehive/task'
import { Schema } from '@forgehive/schema'
import { NewsModel } from '../../models/index.js'

// Task identifier; matches the `forge task:run cloudflare:hackerNewsHome` invocation noted above.
const name = 'cloudflare:hackerNewsHome'
// Human-readable summary passed to createTask alongside the name.
const description = 'Extract top 30 links from Hacker News homepage'

// Schema built from an empty field map: no input fields are declared for this task.
const schema = new Schema({})

/** A single name/value attribute pair reported for a scraped element. */
interface ElementAttributes {
  name: string;
  value: string;
}

/** One scraped element: its text content and attribute list, both optional. */
interface ElementResultItem {
  text?: string;
  attributes?: ElementAttributes[];
}

/** All elements returned for one CSS selector of the scrape request. */
interface ElementResult {
  selector: string;
  results: ElementResultItem[];
}

/** A story as returned by the task: title, target URL and comment-page URL. */
interface HackerNewsItem {
  title: string;
  link: string;
  commentLink: string;
}

/**
 * Side-effecting operations used by the task: the remote scrape request and
 * the database write. They are grouped here so the task function receives
 * them as injected boundaries.
 */
const boundaries = {
  /**
   * Requests the Hacker News front page through Cloudflare's
   * browser-rendering `scrape` endpoint, asking for five CSS selectors.
   *
   * @returns The response body (`response.data`) exactly as returned by axios.
   * @throws Error 'Missing Cloudflare credentials' when CLOUDFLARE_API_TOKEN or
   *   CLOUDFLARE_ACCOUNT_ID is unset (thrown before any request is made, so it
   *   is not wrapped).
   * @throws Error 'Failed to scrape Hacker News: …' when the request fails.
   */
  scrapeHackerNews: async () => {
    const apiToken = process.env.CLOUDFLARE_API_TOKEN
    const accountId = process.env.CLOUDFLARE_ACCOUNT_ID

    if (!apiToken || !accountId) {
      throw new Error('Missing Cloudflare credentials')
    }

    try {
      const response = await axios.post(
        `https://api.cloudflare.com/client/v4/accounts/${accountId}/browser-rendering/scrape`,
        {
          url: 'https://news.ycombinator.com',
          elements: [
            {
              selector: "tr.athing"
            },
            {
              selector: "span.titleline > a"
            },
            {
              selector: "span.subline a[href*='item?id=']"
            },
            {
              selector: "span.subline a[href*='user?id=']"
            },
            {
              selector: "span.rank"
            }
          ]
        },
        {
          headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${apiToken}`
          }
        }
      )

      return response.data
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      throw new Error(`Failed to scrape Hacker News: ${errorMessage}`)
    }
  },

  /**
   * Inserts a news item, or refreshes the stored one with the same
   * `hackerNewsId`.
   *
   * - Existing item, rank changed: sets `lastSeen`, `rank` and `updatedAt`,
   *   returns 'updated'.
   * - Existing item, same rank: sets only `lastSeen`, returns 'skipped'.
   * - No existing item: creates it with `ingestedAt` and `lastSeen` set to the
   *   same timestamp, returns 'saved'.
   *
   * @param newsItem Story fields to persist; `hackerNewsId` is the lookup key.
   * @returns 'saved' | 'updated' | 'skipped' as described above.
   * @throws Error 'Failed to save news item: …' wrapping any lookup/write failure.
   */
  saveNewsItem: async (newsItem: { title: string; url: string; submitter: string; hackerNewsId: string; rank: number }) => {
    try {
      const now = new Date();

      // Look the item up by its Hacker News id to decide between update and insert.
      const existingItem = await NewsModel.findOne({ hackerNewsId: newsItem.hackerNewsId });

      if (existingItem) {
        const rankChanged = existingItem.rank !== newsItem.rank;

        // lastSeen is always refreshed; rank/updatedAt only when the rank moved.
        const updateFields: { lastSeen: Date; rank?: number; updatedAt?: Date } = rankChanged
          ? { lastSeen: now, rank: newsItem.rank, updatedAt: now }
          : { lastSeen: now };

        await NewsModel.updateOne(
          { hackerNewsId: newsItem.hackerNewsId },
          updateFields
        );

        if (rankChanged) {
          console.log(`Updated rank for "${newsItem.title}": ${existingItem.rank} -> ${newsItem.rank}`);
          return 'updated';
        }

        console.log(`News item seen again with same rank: ${newsItem.title}`);
        return 'skipped';
      }

      // First sighting: ingestedAt and lastSeen share the same timestamp.
      await NewsModel.create({
        title: newsItem.title,
        url: newsItem.url,
        submitter: newsItem.submitter,
        hackerNewsId: newsItem.hackerNewsId,
        rank: newsItem.rank,
        ingestedAt: now,
        lastSeen: now
      });

      return 'saved';
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      throw new Error(`Failed to save news item: ${errorMessage}`);
    }
  }
}

/** Maximum number of front-page stories processed per run. */
const MAX_FRONT_PAGE_ITEMS = 30

/** Prefix used to turn site-relative hrefs into absolute URLs. */
const HACKER_NEWS_BASE_URL = 'https://news.ycombinator.com'

/** Captures the numeric item id from an `item?id=…` href. */
const ITEM_ID_PATTERN = /item\?id=(\d+)/

/** Returns the scraped elements for one selector, or an empty list when absent. */
function elementsForSelector(scraped: ElementResult[], selector: string): ElementResultItem[] {
  const entry = scraped.find((candidate: ElementResult) => candidate.selector === selector)
  return entry?.results ?? []
}

/** Returns the element's `href` attribute value, or undefined when it has none. */
function hrefOf(element: ElementResultItem): string | undefined {
  return element.attributes?.find((attr: ElementAttributes) => attr.name === 'href')?.value
}

/** Collects text/href pairs for every element that has both text and an href attribute. */
function extractTitleLinks(elements: ElementResultItem[]): Array<{ text: string, href: string }> {
  const links: Array<{ text: string, href: string }> = []
  for (const element of elements) {
    const href = hrefOf(element)
    if (element.text && href !== undefined) {
      links.push({ text: element.text, href })
    }
  }
  return links
}

/**
 * Builds one absolute comment-page URL per story from the `item?id=` links.
 * The first link seen for each item id is kept, so the result does not depend
 * on every story contributing exactly two such links (time + comments).
 */
function extractCommentLinks(elements: ElementResultItem[]): string[] {
  const seen = new Set<string>()
  const links: string[] = []
  for (const element of elements) {
    const href = hrefOf(element)
    if (!href || !href.includes('item?id=')) {
      continue
    }
    // Key on the numeric id when present; fall back to the raw href otherwise.
    const key = href.match(ITEM_ID_PATTERN)?.[1] ?? href
    if (seen.has(key)) {
      continue
    }
    seen.add(key)
    links.push(`${HACKER_NEWS_BASE_URL}/${href}`)
  }
  return links
}

/** Collects the non-empty text of each element (the submitter names). */
function extractSubmitters(elements: ElementResultItem[]): string[] {
  const names: string[] = []
  for (const element of elements) {
    if (element.text) {
      names.push(element.text)
    }
  }
  return names
}

/** Parses rank labels such as "2." into numbers, skipping elements that do not match. */
function extractRanks(elements: ElementResultItem[]): number[] {
  const ranks: number[] = []
  for (const element of elements) {
    const match = element.text ? element.text.match(/(\d+)\./) : null
    if (match) {
      ranks.push(parseInt(match[1], 10))
    }
  }
  return ranks
}

/**
 * Task: scrape the Hacker News front page, pair titles, comment links,
 * submitters and ranks by position, and persist up to MAX_FRONT_PAGE_ITEMS
 * stories through the `saveNewsItem` boundary.
 *
 * Returns `{ success, items, message }` where `items` lists every story that
 * was extracted (including ones whose save failed) and `message` summarizes
 * the new/updated/duplicate/error counts.
 *
 * Throws Error 'Failed to process Hacker News data: …' when scraping fails or
 * the scrape payload has no result array. Individual save failures are logged
 * and counted, not rethrown.
 */
export const hackerNewsHome = createTask({
  name,
  description,
  schema,
  boundaries,
  fn: async function (argv, { scrapeHackerNews, saveNewsItem }) {
    try {
      const scrapeResult = await scrapeHackerNews()

      // Reject payloads without a result array up front instead of failing later on `.find`.
      if (!scrapeResult || !Array.isArray(scrapeResult.result)) {
        throw new Error('No scrape results found')
      }
      const scraped: ElementResult[] = scrapeResult.result

      const titleLinks = extractTitleLinks(elementsForSelector(scraped, "span.titleline > a"))
      const commentLinks = extractCommentLinks(elementsForSelector(scraped, "span.subline a[href*='item?id=']"))
      const submitters = extractSubmitters(elementsForSelector(scraped, "span.subline a[href*='user?id=']"))
      const ranks = extractRanks(elementsForSelector(scraped, "span.rank"))

      // The four lists are paired purely by index; differing lengths mean at
      // least one row is missing a field and later pairings may be shifted.
      // NOTE(review): rows without a subline (if the page has any) would cause this — confirm.
      const counts = [titleLinks.length, commentLinks.length, submitters.length, ranks.length]
      if (new Set(counts).size > 1) {
        console.warn(
          `Hacker News selector counts differ (titles=${titleLinks.length}, commentLinks=${commentLinks.length}, ` +
          `submitters=${submitters.length}, ranks=${ranks.length}); items are paired by position and may be misaligned`
        )
      }

      const items: HackerNewsItem[] = []
      const savedCount = { successful: 0, duplicates: 0, updates: 0, errors: 0 }
      const maxItems = Math.min(MAX_FRONT_PAGE_ITEMS, ...counts)

      for (let i = 0; i < maxItems; i++) {
        const titleLink = titleLinks[i]
        const commentLink = commentLinks[i]
        const submitter = submitters[i] || 'unknown'
        const rank = ranks[i]

        if (!titleLink || !commentLink || !rank) {
          continue
        }

        // The item id embedded in the comment link is the persistence key.
        const hackerNewsId = commentLink.match(ITEM_ID_PATTERN)?.[1] ?? ''
        if (!hackerNewsId) {
          console.warn(`Could not extract Hacker News ID from: ${commentLink}`);
          continue
        }

        const newsItem: HackerNewsItem = {
          title: titleLink.text,
          // Site-relative hrefs (e.g. self posts) are made absolute.
          link: titleLink.href.startsWith('http') ? titleLink.href : `${HACKER_NEWS_BASE_URL}/${titleLink.href}`,
          commentLink
        }

        // A failed save is counted and logged but does not abort the run.
        try {
          const outcome = await saveNewsItem({
            title: newsItem.title,
            url: newsItem.link,
            submitter,
            hackerNewsId,
            rank
          })

          switch (outcome) {
            case 'saved':
              savedCount.successful++
              break
            case 'skipped':
              savedCount.duplicates++
              break
            case 'updated':
              savedCount.updates++
              break
          }
        } catch (error: unknown) {
          console.error(`Failed to save news item "${newsItem.title}":`, error)
          savedCount.errors++
        }

        items.push(newsItem)
      }

      return {
        success: true,
        items: items,
        message: `Successfully extracted ${items.length} items from Hacker News homepage. Saved: ${savedCount.successful} new, ${savedCount.updates} updated, ${savedCount.duplicates} duplicates, ${savedCount.errors} errors.`
      }
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      throw new Error(`Failed to process Hacker News data: ${errorMessage}`)
    }
  }
})

cloudflare:hackerNewsHome by Daniel Zavala de la Vega - Hive