cloudflare:hackerNewsCommentPage

Extract top-level comments from a Hacker News item page

Published August 5, 2025
Source Code
// TASK: hackerNewsCommentPage
// Run this task with:
// forge task:run cloudflare:hackerNewsCommentPage --hackerNewsId 44440968

import axios from 'axios';
import { createTask } from '@forgehive/task'
import { Schema } from '@forgehive/schema'

// Task identity consumed by the forge CLI
// (`forge task:run cloudflare:hackerNewsCommentPage --hackerNewsId <id>`).
const name = 'cloudflare:hackerNewsCommentPage'
const description = 'Extract top-level comments from a Hacker News item page'

// Input contract: the numeric Hacker News item id (e.g. 44440968).
const schema = new Schema({
  hackerNewsId: Schema.number()
})

// One entry in the Cloudflare Browser Rendering /scrape response:
// the selector that was queried plus every node it matched.
interface ElementResult {
  selector: string;
  results: {
    text?: string;
    attributes?: {
      name: string;
      value: string;
    }[];
  }[];
}

// A single HTML attribute captured from a matched element, as a name/value pair.
interface ElementAttributes {
  value: string;
  name: string;
}

// One matched node from a scrape result: its text content (if any)
// plus whatever attributes the renderer captured for it.
interface ElementResultItem {
  text?: string;
  attributes?: Array<ElementAttributes>;
}

// A top-level HN comment; `author` is omitted when no username
// could be paired with the comment text.
interface HackerNewsComment {
  author?: string;
  text: string;
}

const boundaries = {
  /**
   * Fetch the HN item page through Cloudflare's Browser Rendering /scrape API
   * and return the raw scrape payload.
   *
   * @param hackerNewsId - HN item id, already stringified by the caller
   * @returns the parsed JSON body from the Cloudflare API
   * @throws Error when credentials are missing or the HTTP request fails
   */
  scrapeCommentPage: async (hackerNewsId: string) => {
    const apiToken = process.env.CLOUDFLARE_API_TOKEN
    const accountId = process.env.CLOUDFLARE_ACCOUNT_ID

    if (!apiToken || !accountId) {
      throw new Error('Missing Cloudflare credentials')
    }

    // Encode defensively: the id should be numeric, but the URL must stay
    // well-formed even if a caller passes something unexpected.
    const url = `https://news.ycombinator.com/item?id=${encodeURIComponent(hackerNewsId)}`

    try {
      const response = await axios.post(
        `https://api.cloudflare.com/client/v4/accounts/${accountId}/browser-rendering/scrape`,
        {
          url,
          // Only top-level comments: rows whose indent spacer has indent='0'.
          elements: [
            {
              selector: "td.ind[indent='0'] ~ td div.commtext"
            },
            {
              selector: "td.ind[indent='0'] ~ td a.hnuser"
            }
          ]
        },
        {
          headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${apiToken}`
          }
        }
      )

      return response.data
    } catch (error: unknown) {
      // Preserve the HTTP status from axios failures instead of collapsing
      // every error into a bare message — it is the single most useful
      // datum when the Cloudflare call is rejected (401/403/429/...).
      let detail = 'Unknown error'
      if (error instanceof Error) {
        detail = error.message
        const status = (error as { response?: { status?: number } }).response?.status
        if (status !== undefined) {
          detail = `${detail} (HTTP ${status})`
        }
      }
      throw new Error(`Failed to scrape Hacker News comments: ${detail}`)
    }
  }
}

// These must mirror the selectors requested by the scrapeCommentPage boundary;
// they are the keys used to locate each result set in the scrape payload.
const TOP_LEVEL_COMMENT_SELECTOR = "td.ind[indent='0'] ~ td div.commtext"
const TOP_LEVEL_AUTHOR_SELECTOR = "td.ind[indent='0'] ~ td a.hnuser"

// Collect the non-empty text of every node matched by `selector`.
// Returns [] when the selector is absent from the payload.
const collectTexts = (results: ElementResult[], selector: string): string[] => {
  const match = results.find((item: ElementResult) => item.selector === selector)
  if (!match || !match.results) {
    return []
  }
  return match.results.flatMap((r: ElementResultItem) => (r.text ? [r.text] : []))
}

export const hackerNewsCommentPage = createTask({
  name,
  description,
  schema,
  boundaries,
  fn: async function ({ hackerNewsId }, { scrapeCommentPage }) {
    // Scrape the item page for top-level (indent=0) comment rows.
    const scrapeResult = await scrapeCommentPage(hackerNewsId.toString())

    if (!scrapeResult || !scrapeResult.result) {
      throw new Error('No scrape results found')
    }

    const commentTexts = collectTexts(scrapeResult.result, TOP_LEVEL_COMMENT_SELECTOR)
    const commentAuthors = collectTexts(scrapeResult.result, TOP_LEVEL_AUTHOR_SELECTOR)

    // Pair comments with authors by position; texts beyond the author list
    // are kept without an author.
    // NOTE(review): positional pairing assumes the two result lists stay
    // aligned. A comment rendered without a visible a.hnuser link (e.g. a
    // deleted account) would shift every later pairing — confirm against
    // real pages with such comments.
    const comments: HackerNewsComment[] = commentTexts.map((text, i) =>
      i < commentAuthors.length ? { text, author: commentAuthors[i] } : { text }
    )

    return {
      hackerNewsId,
      comments,
      totalComments: comments.length
    }
  }
})