// Extract top-level comments from a Hacker News item page
// TASK: hackerNewsCommentPage
// Run this task with:
// forge task:run cloudflare:hackerNewsCommentPage --hackerNewsId 44440968
import axios from 'axios';
import { createTask } from '@forgehive/task'
import { Schema } from '@forgehive/schema'
/** Task identifier; this is the name given to `forge task:run`. */
const name = 'cloudflare:hackerNewsCommentPage'

/** Short human-readable summary of the task. */
const description = 'Extract top-level comments from a Hacker News item page'

/** Input schema: a single numeric `hackerNewsId` field. */
const schema = new Schema({ hackerNewsId: Schema.number() })
/** A name/value attribute pair attached to a matched element. */
interface ElementAttributes {
  name: string;
  value: string;
}

/** One element matched by a selector; either field may be absent. */
interface ElementResultItem {
  text?: string;
  attributes?: ElementAttributes[];
}

/**
 * All matches reported for one selector.
 *
 * `results` reuses `ElementResultItem` instead of repeating the same shape
 * inline, so the two definitions cannot drift apart.
 */
interface ElementResult {
  selector: string;
  results: ElementResultItem[];
}

/** A comment as returned by the task; `author` is left out when none is available. */
interface HackerNewsComment {
  text: string;
  author?: string;
}
/** Side-effecting operations the task depends on. */
const boundaries = {
  /**
   * Posts to Cloudflare's `browser-rendering/scrape` endpoint, asking it to
   * load the Hacker News item page for `hackerNewsId` and return the elements
   * matching the comment-text and comment-author selectors.
   *
   * @param hackerNewsId - Item id, placed in the `id` query parameter of the item URL.
   * @returns The response body exactly as axios parsed it (untyped).
   * @throws Error 'Missing Cloudflare credentials' when CLOUDFLARE_API_TOKEN or
   *   CLOUDFLARE_ACCOUNT_ID is unset or empty.
   * @throws Error prefixed with 'Failed to scrape Hacker News comments: ' when
   *   the request fails.
   */
  scrapeCommentPage: async (hackerNewsId: string) => {
    const apiToken = process.env.CLOUDFLARE_API_TOKEN
    const accountId = process.env.CLOUDFLARE_ACCOUNT_ID
    if (!apiToken || !accountId) {
      throw new Error('Missing Cloudflare credentials')
    }

    // Encode the id so a malformed value cannot append extra query parameters.
    const url = `https://news.ycombinator.com/item?id=${encodeURIComponent(hackerNewsId)}`

    try {
      const response = await axios.post(
        `https://api.cloudflare.com/client/v4/accounts/${accountId}/browser-rendering/scrape`,
        {
          url,
          elements: [
            // Only rows whose indent cell carries indent='0' are targeted.
            { selector: "td.ind[indent='0'] ~ td div.commtext" },
            { selector: "td.ind[indent='0'] ~ td a.hnuser" }
          ]
        },
        {
          headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${apiToken}`
          }
        }
      )
      return response.data
    } catch (error: unknown) {
      let errorMessage = error instanceof Error ? error.message : 'Unknown error'

      // When the server answered, include its status and (truncated) body so
      // the failure can be diagnosed from the message alone.
      if (axios.isAxiosError(error) && error.response) {
        const body: unknown = error.response.data
        const detail = typeof body === 'string' ? body : JSON.stringify(body)
        if (detail) {
          errorMessage += ` (HTTP ${error.response.status}: ${detail.slice(0, 500)})`
        }
      }

      throw new Error(`Failed to scrape Hacker News comments: ${errorMessage}`)
    }
  }
}
/**
 * Task that scrapes a Hacker News item page and returns its comments.
 *
 * Input: `{ hackerNewsId }`.
 * Output: `{ hackerNewsId, comments, totalComments }`, where `totalComments`
 * is `comments.length`.
 * Throws 'No scrape results found' when the scrape response is missing or has
 * no `result` field.
 */
export const hackerNewsCommentPage = createTask({
  name,
  description,
  schema,
  boundaries,
  fn: async function ({ hackerNewsId }, { scrapeCommentPage }) {
    const scraped = await scrapeCommentPage(hackerNewsId.toString())
    if (!(scraped && scraped.result)) {
      throw new Error('No scrape results found')
    }

    // Returns the non-empty `text` values reported for one selector, in the
    // order they appear in the response; an unknown selector yields [].
    const textsFor = (selector: string): string[] => {
      const group = scraped.result.find((entry: ElementResult) => entry.selector === selector)
      if (!group || !group.results) {
        return []
      }
      return group.results.flatMap((match: ElementResultItem) => (match.text ? [match.text] : []))
    }

    // These selector strings must match the ones sent in the scrape request.
    const bodies = textsFor("td.ind[indent='0'] ~ td div.commtext")
    const authors = textsFor("td.ind[indent='0'] ~ td a.hnuser")

    // Bodies and authors are paired purely by position; bodies beyond the last
    // author are kept without an `author` field.
    // NOTE(review): this assumes both selectors yield one match per comment in
    // the same order — confirm against the page markup.
    const comments: HackerNewsComment[] = bodies.map((text, position) =>
      position < authors.length ? { text, author: authors[position] } : { text }
    )

    return {
      hackerNewsId,
      comments,
      totalComments: comments.length
    }
  }
})