From 677b846eb2ef0dc6475e92f9a6de1c9dff23e051 Mon Sep 17 00:00:00 2001 From: zyachel Date: Sun, 5 Mar 2023 00:34:53 +0530 Subject: [PATCH] fix: fix broken answers feed this commit fixes the issue of answers not being shown as the underlying HTML changed which broke old scraping logic BREAKING CHANGE: older versions will not work with answer route fix https://github.com/zyachel/quetre/issues/96 --- fetchers/answersFetcher.js | 76 +++++++++++++++++++++++++++ fetchers/getAnswers.js | 104 +++++++++++++++---------------------- 2 files changed, 119 insertions(+), 61 deletions(-) create mode 100644 fetchers/answersFetcher.js diff --git a/fetchers/answersFetcher.js b/fetchers/answersFetcher.js new file mode 100644 index 0000000..74343bf --- /dev/null +++ b/fetchers/answersFetcher.js @@ -0,0 +1,76 @@ +/* eslint-disable no-useless-catch */ +//////////////////////////////////////////////////////// +// IMPORTS +//////////////////////////////////////////////////////// +import * as cheerio from 'cheerio'; +import getAxiosInstance from '../utils/getAxiosInstance.js'; +import AppError from '../utils/AppError.js'; + +//////////////////////////////////////////////////////// +// FUNCTION +//////////////////////////////////////////////////////// +/** + * makes a call to quora.com(with the resourceStr appended) and returns parsed JSON containing the data about the resource requested. 
+ * @param {string} resourceStr a string after the baseURL + * @param {string} lang value passed to getAxiosInstance to configure the request + * @returns object with the keys question, answers, related and answerCount + */ +const answersFetcher = async (resourceStr, lang) => { + try { + const axiosInstance = getAxiosInstance(lang); + const res = await axiosInstance.get(encodeURIComponent(resourceStr)); + const $ = cheerio.load(res.data); + + const rawData = { question: null, answers: [], related: [], answerCount: 0 }; + + // there are about 9-10 script tags containing data we need + $('body script').each((i, el) => { + const text = $(el).html(); + const matches = text.match(/\.push\((".*")\);/); // data is contained like: someProp.push(""); + + if (!matches) return; + + // brittle logic, but works + const matchedPart = JSON.parse(JSON.parse(matches[1])).data; + + // only question block has this word + if (typeof matchedPart.question?.viewerHasAnswered !== 'undefined') { + rawData.question = matchedPart.question; + + // primary answer block + } else if (matchedPart.question?.answers?.edges) { + rawData.answers.push(matchedPart.question.answers.edges[0].node.answer); + + // other answer blocks + } else if ( + // eslint-disable-next-line no-underscore-dangle + matchedPart.node?.__typename === 'QuestionRelevantAnswerItem2' + ) { + rawData.answers.push(matchedPart.node.answer); + + // related questions block contains both answer count and related questions + } else if (matchedPart.bottomRelatedQuestionsInfo) { + rawData.related = matchedPart.bottomRelatedQuestionsInfo.relatedQuestions; + rawData.answerCount = matchedPart.answerCount; + } + }); + + if (!rawData.question) throw new AppError("couldn't retrieve data", 500); + + return rawData; + } catch (err) { + const statusCode = err.response?.status; + if (statusCode === 404) throw new AppError('Not found', 404); + else if (statusCode === 429 || statusCode === 403) + throw new AppError( + 'Quora is rate limiting this instance. 
Try another or host your own.', + 503 + ); + else throw err; + } +}; + +//////////////////////////////////////////////////////// +// EXPORTS +//////////////////////////////////////////////////////// +export default answersFetcher; diff --git a/fetchers/getAnswers.js b/fetchers/getAnswers.js index 6235e0c..fb36f72 100644 --- a/fetchers/getAnswers.js +++ b/fetchers/getAnswers.js @@ -1,87 +1,69 @@ //////////////////////////////////////////////////////// // IMPORTS //////////////////////////////////////////////////////// -// import log from '../utils/log.js'; -import AppError from '../utils/AppError.js'; import { quetrefy } from '../utils/urlModifiers.js'; -import fetcher from './fetcher.js'; +import answersFetcher from './answersFetcher.js'; //////////////////////////////////////////////////////// // FUNCTION //////////////////////////////////////////////////////// -const KEYWORD = 'question'; - const getAnswers = async (slug, lang) => { // getting data and destructuring it in case it exists - const res = await fetcher(slug, { keyword: KEYWORD, lang }); - - const { - data: { [KEYWORD]: rawData }, - } = JSON.parse(res); - - if (!rawData) - throw new AppError( - "Answers couldn't be fetched. 
Recheck the URL, or resend the request if you believe the URL is correct.", - 404 - ); + const rawData = await answersFetcher(slug, lang); // array containing all the answers with metadata - const ansArr = rawData.pagedListDataConnection.edges - .filter(ansObj => ansObj.node.answer !== undefined) - .map(ansObj => ({ - text: JSON.parse(ansObj.node.answer.content).sections, - isViewable: !!ansObj.node.answer.viewerHasAccess, - creationTime: ansObj.node.answer.creationTime, - updatedTime: ansObj.node.answer.updatedTime, - numComments: ansObj.node.answer.numDisplayComments, - numUpvotes: ansObj.node.answer.numUpvotes, - numViews: ansObj.node.answer.numViews, - numShares: ansObj.node.answer.numSharers, - numAnswerRequests: ansObj.node.answer.numRequesters, - aid: ansObj.node.answer.aid, - isBusinessAnswer: ansObj.node.answer.businessAnswer, - author: { - uid: ansObj.node.answer.author.uid, - isAnon: ansObj.node.answer.author.isAnon, - image: ansObj.node.answer.author.profileImageUrl, - isVerified: ansObj.node.answer.author.isVerified, - url: quetrefy(ansObj.node.answer.author.profileUrl), - name: `${ansObj.node.answer.author.names[0].givenName} ${ansObj.node.answer.author.names[0].familyName}`, - credential: ansObj.node.answer.authorCredential?.translatedString, - // additionalCredentials: ansObj.node.answer?.credibilityFacts.map(), - }, - originalQuestion: { - text: JSON.parse(ansObj.node.answer.question.title).sections, - url: quetrefy(ansObj.node.answer.question.url), - qid: ansObj.node.answer.question.qid, - isDeleted: ansObj.node.answer.question.isDeleted, - }, - })); + const ansArr = rawData.answers.map(answer => ({ + text: JSON.parse(answer.content).sections, + isViewable: !!answer.viewerHasAccess, + creationTime: answer.creationTime, + updatedTime: answer.updatedTime, + numComments: answer.numDisplayComments, + numUpvotes: answer.numUpvotes, + numViews: answer.numViews, + numShares: answer.numShares, + numAnswerRequests: answer.numRequesters, + aid: answer.aid, 
+ isBusinessAnswer: answer.businessAnswer, + author: { + uid: answer.author.uid, + isAnon: answer.author.isAnon, + image: answer.author.profileImageUrl, + isVerified: answer.author.isVerified, + url: quetrefy(answer.author.profileUrl), + name: `${answer.author.names[0].givenName} ${answer.author.names[0].familyName}`, + credential: answer.authorCredential?.translatedString, + // additionalCredentials: answer.node.answer?.credibilityFacts.map(), + }, + originalQuestion: { + text: JSON.parse(answer.question.title).sections, + url: quetrefy(answer.question.url), + qid: answer.question.qid, + isDeleted: answer.question.isDeleted, + }, + })); // main data object to be returned const data = { question: { - text: JSON.parse(rawData.title).sections, - url: quetrefy(rawData.url), - qid: rawData.qid, - idDeleted: rawData.isDeleted, - isViewable: rawData.isVisibleToViewer, - askerUid: rawData.asker.uid, + text: JSON.parse(rawData.question.title).sections, + url: quetrefy(rawData.question.url), + qid: rawData.question.qid, + idDeleted: rawData.question.isDeleted, + isViewable: rawData.question.isVisibleToViewer, + askerUid: rawData.question.asker.uid, }, numAnswers: rawData.answerCount, answers: ansArr, - topics: rawData.topics.map(topicObj => ({ + topics: rawData.question.topics.map(topicObj => ({ tid: topicObj.tid, name: topicObj.name, url: quetrefy(topicObj.url), })), - relatedQuestions: rawData.bottomRelatedQuestionsInfo.relatedQuestions.map( - questionObj => ({ - qid: questionObj.qid, - url: quetrefy(questionObj.url), - text: JSON.parse(questionObj.title).sections, - }) - ), + relatedQuestions: rawData.related.map(questionObj => ({ + qid: questionObj.qid, + url: quetrefy(questionObj.url), + text: JSON.parse(questionObj.title).sections, + })), }; return data;