List post duplicates (resolves #574).

This commit is contained in:
Daniel Valentine 2022-11-09 09:16:51 -07:00
parent fade305f90
commit e579b97442
No known key found for this signature in database
GPG key ID: C82492E4FF813823
21 changed files with 695 additions and 250 deletions

View file

@ -158,10 +158,21 @@ fn request(method: &'static Method, path: String, redirect: bool, quarantine: bo
method,
response
.headers()
.get("Location")
.get(header::LOCATION)
.map(|val| {
let new_url = percent_encode(val.as_bytes(), CONTROLS).to_string();
format!("{}{}raw_json=1", new_url, if new_url.contains('?') { "&" } else { "?" })
// We need to make adjustments to the URI
// we get back from Reddit. Namely, we
// must:
//
// 1. Remove the authority (e.g.
// https://www.reddit.com) that may be
// present, so that we recurse on the
// path (and query parameters) as
// required.
//
// 2. Percent-encode the path.
let new_path = percent_encode(val.as_bytes(), CONTROLS).to_string().trim_start_matches(REDDIT_URL_BASE).to_string();
format!("{}{}raw_json=1", new_path, if new_path.contains('?') { "&" } else { "?" })
})
.unwrap_or_default()
.to_string(),

228
src/duplicates.rs Normal file
View file

@ -0,0 +1,228 @@
// Handler for post duplicates.
use crate::client::json;
use crate::server::RequestExt;
use crate::subreddit::{can_access_quarantine, quarantine};
use crate::utils::{error, filter_posts, get_filters, parse_post, template, Post, Preferences};
use askama::Template;
use hyper::{Body, Request, Response};
use serde_json::Value;
use std::borrow::ToOwned;
use std::collections::HashSet;
use std::vec::Vec;
/// DuplicatesParams contains the parameters in the URL.
struct DuplicatesParams {
before: String,
after: String,
sort: String,
}
/// DuplicatesTemplate defines an Askama template for rendering duplicate
/// posts.
#[derive(Template)]
#[template(path = "duplicates.html")]
struct DuplicatesTemplate {
/// params contains the relevant request parameters.
params: DuplicatesParams,
/// post is the post whose ID is specified in the reqeust URL. Note that
/// this is not necessarily the "original" post.
post: Post,
/// duplicates is the list of posts that, per Reddit, are duplicates of
/// Post above.
duplicates: Vec<Post>,
/// prefs are the user preferences.
prefs: Preferences,
/// url is the request URL.
url: String,
/// num_posts_filtered counts how many posts were filtered from the
/// duplicates list.
num_posts_filtered: u64,
/// all_posts_filtered is true if every duplicate was filtered. This is an
/// edge case but can still happen.
all_posts_filtered: bool,
}
/// Make the GET request to Reddit. It assumes `req` is the appropriate Reddit
/// REST endpoint for enumerating post duplicates.
pub async fn item(req: Request<Body>) -> Result<Response<Body>, String> {
let path: String = format!("{}.json?{}&raw_json=1", req.uri().path(), req.uri().query().unwrap_or_default());
let sub = req.param("sub").unwrap_or_default();
let quarantined = can_access_quarantine(&req, &sub);
// Log the request in debugging mode
#[cfg(debug_assertions)]
dbg!(req.param("id").unwrap_or_default());
// Send the GET, and await JSON.
match json(path, quarantined).await {
// Process response JSON.
Ok(response) => {
let filters = get_filters(&req);
let post = parse_post(&response[0]["data"]["children"][0]).await;
let (duplicates, num_posts_filtered, all_posts_filtered) = parse_duplicates(&response[1], &filters).await;
// These are the values for the "before=", "after=", and "sort="
// query params, respectively.
let mut before: String = String::new();
let mut after: String = String::new();
let mut sort: String = String::new();
// FIXME: We have to perform a kludge to work around a Reddit API
// bug.
//
// The JSON object in "data" will never contain a "before" value so
// it is impossible to use it to determine our position in a
// listing. We'll make do by getting the ID of the first post in
// the listing, setting that as our "before" value, and ask Reddit
// to give us a batch of duplicate posts up to that post.
//
// Likewise, if we provide a "before" request in the GET, the
// result won't have an "after" in the JSON, in addition to missing
// the "before." So we will have to use the final post in the list
// of duplicates.
//
// That being said, we'll also need to capture the value of the
// "sort=" parameter as well, so we will need to inspect the
// query key-value pairs anyway.
let l = duplicates.len();
if l > 0 {
// This gets set to true if "before=" is one of the GET params.
let mut have_before: bool = false;
// This gets set to true if "after=" is one of the GET params.
let mut have_after: bool = false;
// Inspect the query key-value pairs. We will need to record
// the value of "sort=", along with checking to see if either
// one of "before=" or "after=" are given.
//
// If we're in the middle of the batch (evidenced by the
// presence of a "before=" or "after=" parameter in the GET),
// then use the first post as the "before" reference.
//
// We'll do this iteratively. Better than with .map_or()
// since a closure will continue to operate on remaining
// elements even after we've determined one of "before=" or
// "after=" (or both) are in the GET request.
//
// In practice, here should only ever be one of "before=" or
// "after=" and never both.
let query_str = req.uri().query().unwrap_or_default().to_string();
if !query_str.is_empty() {
for param in query_str.split('&') {
let kv: Vec<&str> = param.split('=').collect();
if kv.len() < 2 {
// Reject invalid query parameter.
continue;
}
let key: &str = kv[0];
match key {
"before" => have_before = true,
"after" => have_after = true,
"sort" => {
let val: &str = kv[1];
match val {
"new" | "num_comments" => sort = val.to_string(),
_ => {}
}
}
_ => {}
}
}
}
if have_after {
before = "t3_".to_owned();
before.push_str(&duplicates[0].id);
}
// Address potentially missing "after". If "before=" is in the
// GET, then "after" will be null in the JSON (see FIXME
// above).
if have_before {
// The next batch will need to start from one after the
// last post in the current batch.
after = "t3_".to_owned();
after.push_str(&duplicates[l - 1].id);
// Here is where things get terrible. Notice that we
// haven't set `before`. In order to do so, we will
// need to know if there is a batch that exists before
// this one, and doing so requires actually fetching the
// previous batch. In other words, we have to do yet one
// more GET to Reddit. There is no other way to determine
// whether or not to define `before`.
//
// We'll mitigate that by requesting at most one duplicate.
let new_path: String = format!(
"{}.json?before=t3_{}&sort={}&limit=1&raw_json=1",
req.uri().path(),
&duplicates[0].id,
if sort.is_empty() { "num_comments".to_string() } else { sort.clone() }
);
match json(new_path, true).await {
Ok(response) => {
if !response[1]["data"]["children"].as_array().unwrap_or(&Vec::new()).is_empty() {
before = "t3_".to_owned();
before.push_str(&duplicates[0].id);
}
}
Err(msg) => {
// Abort entirely if we couldn't get the previous
// batch.
return error(req, msg).await;
}
}
} else {
after = response[1]["data"]["after"].as_str().unwrap_or_default().to_string();
}
}
let url = req.uri().to_string();
template(DuplicatesTemplate {
params: DuplicatesParams { before, after, sort },
post,
duplicates,
prefs: Preferences::new(req),
url,
num_posts_filtered,
all_posts_filtered,
})
}
// Process error.
Err(msg) => {
if msg == "quarantined" {
let sub = req.param("sub").unwrap_or_default();
quarantine(req, sub)
} else {
error(req, msg).await
}
}
}
}
// DUPLICATES
async fn parse_duplicates(json: &serde_json::Value, filters: &HashSet<String>) -> (Vec<Post>, u64, bool) {
let post_duplicates: &Vec<Value> = &json["data"]["children"].as_array().map_or(Vec::new(), ToOwned::to_owned);
let mut duplicates: Vec<Post> = Vec::new();
// Process each post and place them in the Vec<Post>.
for val in post_duplicates.iter() {
let post: Post = parse_post(val).await;
duplicates.push(post);
}
let (num_posts_filtered, all_posts_filtered) = filter_posts(&mut duplicates, filters);
(duplicates, num_posts_filtered, all_posts_filtered)
}

View file

@ -3,6 +3,7 @@
#![allow(clippy::cmp_owned)]
// Reference local files
mod duplicates;
mod post;
mod search;
mod settings;
@ -244,6 +245,11 @@ async fn main() {
app.at("/comments/:id/:title").get(|r| post::item(r).boxed());
app.at("/comments/:id/:title/:comment_id").get(|r| post::item(r).boxed());
app.at("/r/:sub/duplicates/:id").get(|r| duplicates::item(r).boxed());
app.at("/r/:sub/duplicates/:id/:title").get(|r| duplicates::item(r).boxed());
app.at("/duplicates/:id").get(|r| duplicates::item(r).boxed());
app.at("/duplicates/:id/:title").get(|r| duplicates::item(r).boxed());
app.at("/r/:sub/search").get(|r| search::find(r).boxed());
app

View file

@ -3,7 +3,7 @@ use crate::client::json;
use crate::server::RequestExt;
use crate::subreddit::{can_access_quarantine, quarantine};
use crate::utils::{
error, format_num, format_url, get_filters, param, rewrite_urls, setting, template, time, val, Author, Awards, Comment, Flags, Flair, FlairPart, Media, Post, Preferences,
error, format_num, get_filters, param, parse_post, rewrite_urls, setting, template, time, val, Author, Awards, Comment, Flair, FlairPart, Post, Preferences,
};
use hyper::{Body, Request, Response};
@ -54,7 +54,7 @@ pub async fn item(req: Request<Body>) -> Result<Response<Body>, String> {
// Otherwise, grab the JSON output from the request
Ok(response) => {
// Parse the JSON into Post and Comment structs
let post = parse_post(&response[0]).await;
let post = parse_post(&response[0]["data"]["children"][0]).await;
let comments = parse_comments(&response[1], &post.permalink, &post.author.name, highlighted_comment, &get_filters(&req));
let url = req.uri().to_string();
@ -80,92 +80,6 @@ pub async fn item(req: Request<Body>) -> Result<Response<Body>, String> {
}
}
// POSTS
async fn parse_post(json: &serde_json::Value) -> Post {
// Retrieve post (as opposed to comments) from JSON
let post: &serde_json::Value = &json["data"]["children"][0];
// Grab UTC time as unix timestamp
let (rel_time, created) = time(post["data"]["created_utc"].as_f64().unwrap_or_default());
// Parse post score and upvote ratio
let score = post["data"]["score"].as_i64().unwrap_or_default();
let ratio: f64 = post["data"]["upvote_ratio"].as_f64().unwrap_or(1.0) * 100.0;
// Determine the type of media along with the media URL
let (post_type, media, gallery) = Media::parse(&post["data"]).await;
let awards: Awards = Awards::parse(&post["data"]["all_awardings"]);
let permalink = val(post, "permalink");
let body = if val(post, "removed_by_category") == "moderator" {
format!(
"<div class=\"md\"><p>[removed] — <a href=\"https://www.unddit.com{}\">view removed post</a></p></div>",
permalink
)
} else {
rewrite_urls(&val(post, "selftext_html"))
};
// Build a post using data parsed from Reddit post API
Post {
id: val(post, "id"),
title: val(post, "title"),
community: val(post, "subreddit"),
body,
author: Author {
name: val(post, "author"),
flair: Flair {
flair_parts: FlairPart::parse(
post["data"]["author_flair_type"].as_str().unwrap_or_default(),
post["data"]["author_flair_richtext"].as_array(),
post["data"]["author_flair_text"].as_str(),
),
text: val(post, "link_flair_text"),
background_color: val(post, "author_flair_background_color"),
foreground_color: val(post, "author_flair_text_color"),
},
distinguished: val(post, "distinguished"),
},
permalink,
score: format_num(score),
upvote_ratio: ratio as i64,
post_type,
media,
thumbnail: Media {
url: format_url(val(post, "thumbnail").as_str()),
alt_url: String::new(),
width: post["data"]["thumbnail_width"].as_i64().unwrap_or_default(),
height: post["data"]["thumbnail_height"].as_i64().unwrap_or_default(),
poster: "".to_string(),
},
flair: Flair {
flair_parts: FlairPart::parse(
post["data"]["link_flair_type"].as_str().unwrap_or_default(),
post["data"]["link_flair_richtext"].as_array(),
post["data"]["link_flair_text"].as_str(),
),
text: val(post, "link_flair_text"),
background_color: val(post, "link_flair_background_color"),
foreground_color: if val(post, "link_flair_text_color") == "dark" {
"black".to_string()
} else {
"white".to_string()
},
},
flags: Flags {
nsfw: post["data"]["over_18"].as_bool().unwrap_or(false),
stickied: post["data"]["stickied"].as_bool().unwrap_or(false),
},
domain: val(post, "domain"),
rel_time,
created,
comments: format_num(post["data"]["num_comments"].as_i64().unwrap_or_default()),
gallery,
awards,
}
}
// COMMENTS
fn parse_comments(json: &serde_json::Value, post_link: &str, post_author: &str, highlighted_comment: &str, filters: &HashSet<String>) -> Vec<Comment> {
// Parse the comment JSON into a Vector of Comments

View file

@ -107,7 +107,7 @@ pub async fn find(req: Request<Body>) -> Result<Response<Body>, String> {
} else {
match Post::fetch(&path, quarantined).await {
Ok((mut posts, after)) => {
let all_posts_filtered = filter_posts(&mut posts, &filters);
let (_, all_posts_filtered) = filter_posts(&mut posts, &filters);
let all_posts_hidden_nsfw = posts.iter().all(|p| p.flags.nsfw) && setting(&req, "show_nsfw") != "on";
template(SearchTemplate {
posts,

View file

@ -118,7 +118,7 @@ pub async fn community(req: Request<Body>) -> Result<Response<Body>, String> {
} else {
match Post::fetch(&path, quarantined).await {
Ok((mut posts, after)) => {
let all_posts_filtered = filter_posts(&mut posts, &filters);
let (_, all_posts_filtered) = filter_posts(&mut posts, &filters);
let all_posts_hidden_nsfw = posts.iter().all(|p| p.flags.nsfw) && setting(&req, "show_nsfw") != "on";
template(SubredditTemplate {
sub,

View file

@ -66,7 +66,7 @@ pub async fn profile(req: Request<Body>) -> Result<Response<Body>, String> {
// Request user posts/comments from Reddit
match Post::fetch(&path, false).await {
Ok((mut posts, after)) => {
let all_posts_filtered = filter_posts(&mut posts, &filters);
let (_, all_posts_filtered) = filter_posts(&mut posts, &filters);
let all_posts_hidden_nsfw = posts.iter().all(|p| p.flags.nsfw) && setting(&req, "show_nsfw") != "on";
template(UserTemplate {
user,

View file

@ -225,6 +225,7 @@ pub struct Post {
pub domain: String,
pub rel_time: String,
pub created: String,
pub num_duplicates: u64,
pub comments: (String, String),
pub gallery: Vec<GalleryMedia>,
pub awards: Awards,
@ -319,11 +320,12 @@ impl Post {
},
flags: Flags {
nsfw: data["over_18"].as_bool().unwrap_or_default(),
stickied: data["stickied"].as_bool().unwrap_or_default(),
stickied: data["stickied"].as_bool().unwrap_or_default() || data["pinned"].as_bool().unwrap_or_default(),
},
permalink: val(post, "permalink"),
rel_time,
created,
num_duplicates: post["data"]["num_duplicates"].as_u64().unwrap_or(0),
comments: format_num(data["num_comments"].as_i64().unwrap_or_default()),
gallery,
awards,
@ -511,15 +513,110 @@ pub fn get_filters(req: &Request<Body>) -> HashSet<String> {
setting(req, "filters").split('+').map(String::from).filter(|s| !s.is_empty()).collect::<HashSet<String>>()
}
/// Filters a `Vec<Post>` by the given `HashSet` of filters (each filter being a subreddit name or a user name). If a
/// `Post`'s subreddit or author is found in the filters, it is removed. Returns `true` if _all_ posts were filtered
/// out, or `false` otherwise.
pub fn filter_posts(posts: &mut Vec<Post>, filters: &HashSet<String>) -> bool {
/// Filters a `Vec<Post>` by the given `HashSet` of filters (each filter being
/// a subreddit name or a user name). If a `Post`'s subreddit or author is
/// found in the filters, it is removed.
///
/// The first value of the return tuple is the number of posts filtered. The
/// second return value is `true` if all posts were filtered.
pub fn filter_posts(posts: &mut Vec<Post>, filters: &HashSet<String>) -> (u64, bool) {
// This is the length of the Vec<Post> prior to applying the filter.
let lb: u64 = posts.len().try_into().unwrap_or(0);
if posts.is_empty() {
false
(0, false)
} else {
posts.retain(|p| !filters.contains(&p.community) && !filters.contains(&["u_", &p.author.name].concat()));
posts.is_empty()
posts.retain(|p| !(filters.contains(&p.community) || filters.contains(&["u_", &p.author.name].concat())));
// Get the length of the Vec<Post> after applying the filter.
// If lb > la, then at least one post was removed.
let la: u64 = posts.len().try_into().unwrap_or(0);
(lb - la, posts.is_empty())
}
}
/// Creates a [`Post`] from a provided JSON.
pub async fn parse_post(post: &serde_json::Value) -> Post {
// Grab UTC time as unix timestamp
let (rel_time, created) = time(post["data"]["created_utc"].as_f64().unwrap_or_default());
// Parse post score and upvote ratio
let score = post["data"]["score"].as_i64().unwrap_or_default();
let ratio: f64 = post["data"]["upvote_ratio"].as_f64().unwrap_or(1.0) * 100.0;
// Determine the type of media along with the media URL
let (post_type, media, gallery) = Media::parse(&post["data"]).await;
let awards: Awards = Awards::parse(&post["data"]["all_awardings"]);
let permalink = val(post, "permalink");
let body = if val(post, "removed_by_category") == "moderator" {
format!(
"<div class=\"md\"><p>[removed] — <a href=\"https://www.unddit.com{}\">view removed post</a></p></div>",
permalink
)
} else {
rewrite_urls(&val(post, "selftext_html"))
};
// Build a post using data parsed from Reddit post API
Post {
id: val(post, "id"),
title: val(post, "title"),
community: val(post, "subreddit"),
body,
author: Author {
name: val(post, "author"),
flair: Flair {
flair_parts: FlairPart::parse(
post["data"]["author_flair_type"].as_str().unwrap_or_default(),
post["data"]["author_flair_richtext"].as_array(),
post["data"]["author_flair_text"].as_str(),
),
text: val(post, "link_flair_text"),
background_color: val(post, "author_flair_background_color"),
foreground_color: val(post, "author_flair_text_color"),
},
distinguished: val(post, "distinguished"),
},
permalink,
score: format_num(score),
upvote_ratio: ratio as i64,
post_type,
media,
thumbnail: Media {
url: format_url(val(post, "thumbnail").as_str()),
alt_url: String::new(),
width: post["data"]["thumbnail_width"].as_i64().unwrap_or_default(),
height: post["data"]["thumbnail_height"].as_i64().unwrap_or_default(),
poster: String::new(),
},
flair: Flair {
flair_parts: FlairPart::parse(
post["data"]["link_flair_type"].as_str().unwrap_or_default(),
post["data"]["link_flair_richtext"].as_array(),
post["data"]["link_flair_text"].as_str(),
),
text: val(post, "link_flair_text"),
background_color: val(post, "link_flair_background_color"),
foreground_color: if val(post, "link_flair_text_color") == "dark" {
"black".to_string()
} else {
"white".to_string()
},
},
flags: Flags {
nsfw: post["data"]["over_18"].as_bool().unwrap_or_default(),
stickied: post["data"]["stickied"].as_bool().unwrap_or_default() || post["data"]["pinned"].as_bool().unwrap_or(false),
},
domain: val(post, "domain"),
rel_time,
created,
num_duplicates: post["data"]["num_duplicates"].as_u64().unwrap_or(0),
comments: format_num(post["data"]["num_comments"].as_i64().unwrap_or_default()),
gallery,
awards,
}
}