From f11d3f2ac641bce64c5a674c0995cf85ffc7e37e Mon Sep 17 00:00:00 2001 From: zyachel Date: Mon, 8 Apr 2024 00:22:51 +0530 Subject: [PATCH] fix(parse): don't bail out on encountering weird characters replace all \x3C(<) with its valid Unicode escape sequence --- fetchers/answersFetcher.js | 3 ++- fetchers/fetcher.js | 12 +++++------- utils/parse.js | 11 +++++++++++ 3 files changed, 18 insertions(+), 8 deletions(-) create mode 100644 utils/parse.js diff --git a/fetchers/answersFetcher.js b/fetchers/answersFetcher.js index 733a768..39d7526 100644 --- a/fetchers/answersFetcher.js +++ b/fetchers/answersFetcher.js @@ -5,6 +5,7 @@ import * as cheerio from 'cheerio'; import getAxiosInstance from '../utils/getAxiosInstance.js'; import AppError from '../utils/AppError.js'; +import parse from '../utils/parse.js'; //////////////////////////////////////////////////////// // FUNCTION @@ -31,7 +32,7 @@ const answersFetcher = async (resourceStr, lang) => { if (!matches) return; // brittle logic, but works - const matchedPart = JSON.parse(JSON.parse(matches[1])).data; + const matchedPart = JSON.parse(parse(matches[1])).data; // only question block has this word if (typeof matchedPart.question?.viewerHasAnswered !== 'undefined') { diff --git a/fetchers/fetcher.js b/fetchers/fetcher.js index d3b7c8d..89b0166 100644 --- a/fetchers/fetcher.js +++ b/fetchers/fetcher.js @@ -5,6 +5,7 @@ import * as cheerio from 'cheerio'; import getAxiosInstance from '../utils/getAxiosInstance.js'; import AppError from '../utils/AppError.js'; +import parse from '../utils/parse.js'; //////////////////////////////////////////////////////// // FUNCTION @@ -18,19 +19,16 @@ import AppError from '../utils/AppError.js'; * await fetcher('topic/Space-Physics'); // will return 'space physics' topic object * await fetcher('profile/Charlie-Cheever'); // will return object containing information about charlie cheever */ -const fetcher = async ( - resourceStr, - { keyword, lang, toEncode = true } -) => { +const fetcher 
= async (resourceStr, { keyword, lang, toEncode = true }) => { try { // as url might contain unescaped chars. so, encoding it right away const str = toEncode ? encodeURIComponent(resourceStr) : resourceStr; const axiosInstance = getAxiosInstance(lang); const res = await axiosInstance.get(str); - + const $ = cheerio.load(res.data); - const regex = new RegExp(`"{\\\\"data\\\\":\\{\\\\"${keyword}.*\\}"`); // equivalent to /"\{\\"data\\":\{\\"searchConnection.*\}"/ + const regex = new RegExp(String.raw`"{\\"data\\":\{\\"${keyword}.*?\}"`); let rawData; $('body script').each((i, el) => { @@ -45,7 +43,7 @@ const fetcher = async ( if (!rawData) throw new AppError("couldn't retrieve data", 500); - return JSON.parse(rawData); + return parse(rawData); } catch (err) { const statusCode = err.response?.status; if (statusCode === 404) throw new AppError('Not found', 404); diff --git a/utils/parse.js b/utils/parse.js new file mode 100644 index 0000000..697dd9b --- /dev/null +++ b/utils/parse.js @@ -0,0 +1,11 @@ +const invalidLessThan = /\\x3C/g; +const validLessThan = '\\u003C'; + +/** + * parses and corrects invalid escape sequences + * @param {string} data + * @returns {Record} + */ +const parse = data => JSON.parse(data.replace(invalidLessThan, validLessThan)); + +export default parse;