basics working. search working
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
[env]
|
||||
|
||||
# Scope down tracing, to filter out external lib tracing.
|
||||
RUST_LOG="web_server=debug,lib_core=debug,lib_auth=debug,lib_utils=debug"
|
||||
RUST_LOG="web_frontend=info,lib_core=debug,lib_auth=debug,lib_utils=debug,web_api=debug,sqlx=debug"
|
||||
|
||||
# -- Service Environment Variables
|
||||
# IMPORTANT:
|
||||
@@ -18,11 +18,12 @@ RUST_LOG="web_server=debug,lib_core=debug,lib_auth=debug,lib_utils=debug"
|
||||
# e.g., "welcome" type of passwords.
|
||||
# i.e., Encryption not needed.
|
||||
|
||||
SERVICE_DB_URL="postgres://app_user:dev_only_pwd@localhost/app_db"
|
||||
SERVICE_DB_URL="postgres://postgres:postgres@db/mediamanager"
|
||||
DATABASE_URL="postgres://postgres:postgres@db/mediamanager"
|
||||
|
||||
SERVICE_PWD_KEY="CKUGFOD9_2Qf6Pn3ZFRYgPYb8ht4vKqEG9PGMXTB7497bT0367DjoaD6ydFnEVaIRda0kKeBZVCT5Hb62m2sCA"
|
||||
# SERVICE_PWD_KEY="CKUGFOD9_2Qf6Pn3ZFRYgPYb8ht4vKqEG9PGMXTB7497bT0367DjoaD6ydFnEVaIRda0kKeBZVCT5Hb62m2sCA"
|
||||
|
||||
SERVICE_TOKEN_KEY="9FoHBmkyxbgu_xFoQK7e0jz3RMNVJWgfvbVn712FBNH9LLaAWS3CS6Zpcg6RveiObvCUb6a2z-uAiLjhLh2igw"
|
||||
# SERVICE_TOKEN_KEY="9FoHBmkyxbgu_xFoQK7e0jz3RMNVJWgfvbVn712FBNH9LLaAWS3CS6Zpcg6RveiObvCUb6a2z-uAiLjhLh2igw"
|
||||
SERVICE_TOKEN_DURATION_SEC="1800" # 30 minutes
|
||||
|
||||
## -- ConfigMap
|
||||
|
||||
11
Cargo.toml
11
Cargo.toml
@@ -12,9 +12,14 @@ members = [
|
||||
"crates/libs/lib-core", # e.g., model, ctx, config.
|
||||
"crates/libs/lib-macros", # e.g., macros.
|
||||
"crates/libs/lib-scraper", # e.g., scraping logic.
|
||||
"crates/libs/lib-components", # e.g., Web HTML components.
|
||||
|
||||
# -- Application Services
|
||||
"crates/services/web-server",
|
||||
"crates/services/web-api",
|
||||
"crates/services/web-frontend",
|
||||
|
||||
# -- Scrapers
|
||||
"crates/services/scrapers/boxnovel",
|
||||
|
||||
# -- Tools
|
||||
"crates/tools/gen-key",
|
||||
@@ -29,4 +34,6 @@ sea-query-binder = { version = "0.6", features = ["sqlx-postgres", "with-uuid",
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = {version = "3", features = ["time_0_3"]}
|
||||
derive_more = {version = "1.0", features = ["from"] }
|
||||
derive_more = {version = "1.0", features = ["from"] }
|
||||
data-url = "0.3.1"
|
||||
rayon = "1.10.0"
|
||||
|
||||
12
crates/libs/lib-components/Cargo.toml
Normal file
12
crates/libs/lib-components/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "lib-components"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[lib]
|
||||
doctest = false
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
[dependencies]
|
||||
5
crates/libs/lib-components/src/lib.rs
Normal file
5
crates/libs/lib-components/src/lib.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
//! The components library is designed to export independent Web HTML component sub-modules to the application code.
|
||||
//!
|
||||
//! Note: Even if the component sub-modules consist of a single file, they contain their own errors
|
||||
//! for improved compartmentalization.
|
||||
//!
|
||||
@@ -7,11 +7,12 @@ use modql::field::{SeaField, SeaFields, HasSeaFields};
|
||||
use modql::filter::{FilterGroups, ListOptions};
|
||||
use modql::SIden;
|
||||
use sea_query::{
|
||||
Condition, Expr, Iden, IntoIden, PostgresQueryBuilder, Query, TableRef,
|
||||
Condition, Expr, Func, Iden, IntoIden, PostgresQueryBuilder, Query, SimpleExpr, TableRef
|
||||
};
|
||||
use sea_query_binder::SqlxBinder;
|
||||
use sqlx::postgres::PgRow;
|
||||
use sqlx::FromRow;
|
||||
use tracing::debug;
|
||||
|
||||
// const LIST_LIMIT_DEFAULT: i64 = 1000;
|
||||
// const LIST_LIMIT_MAX: i64 = 5000;
|
||||
@@ -134,10 +135,11 @@ pub async fn list<MC, E, F>(
|
||||
mm: &ModelManager,
|
||||
filter: Option<F>,
|
||||
list_options: Option<ListOptions>,
|
||||
expressions: Vec<SimpleExpr>,
|
||||
) -> Result<Vec<E>>
|
||||
where
|
||||
MC: DbBmc,
|
||||
F: Into<FilterGroups>,
|
||||
F: Into<FilterGroups> + std::fmt::Debug,
|
||||
E: for<'r> FromRow<'r, PgRow> + Unpin + Send,
|
||||
E: HasSeaFields,
|
||||
{
|
||||
@@ -145,7 +147,9 @@ where
|
||||
|
||||
// -- Build the query
|
||||
let mut query = Query::select();
|
||||
query.from(MC::table_ref()).columns(E::sea_idens());
|
||||
query.from(MC::table_ref()).columns(E::sea_idens().into_iter().filter(|iden| {
|
||||
iden.to_string() != "search_rank"
|
||||
}));
|
||||
|
||||
// condition from filter
|
||||
if let Some(filter) = filter {
|
||||
@@ -153,6 +157,11 @@ where
|
||||
let cond: Condition = filters.try_into()?;
|
||||
query.cond_where(cond);
|
||||
}
|
||||
|
||||
for exp in expressions {
|
||||
query.expr(exp);
|
||||
}
|
||||
|
||||
// list options
|
||||
let list_options = compute_list_options(list_options)?;
|
||||
list_options.apply_to_sea_query(&mut query);
|
||||
|
||||
@@ -1,30 +1,36 @@
|
||||
use crate::ctx::Ctx;
|
||||
use crate::model::base::{self, DbBmc};
|
||||
use crate::model::utils::validate_uri_type;
|
||||
use crate::model::ModelManager;
|
||||
use crate::model::Result;
|
||||
use crate::model::Validation;
|
||||
use crate::model::utils::validate_uri_type;
|
||||
// use crate::model::utils::time_to_sea_value;
|
||||
// use lib_utils::time::Rfc3339;
|
||||
use modql::field::Fields;
|
||||
use modql::field::{Fields, HasSeaFields};
|
||||
use modql::filter::{
|
||||
FilterNodes, ListOptions, OpValsInt64, OpValsString,
|
||||
FilterGroups, FilterNodes, ListOptions, OpValsInt64, OpValsString,
|
||||
};
|
||||
use sea_query::{Condition, Expr, Func, Iden, PostgresQueryBuilder, Query};
|
||||
use sea_query_binder::SqlxBinder;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sqlx::types::time::OffsetDateTime;
|
||||
use sqlx::FromRow;
|
||||
|
||||
use super::ValidationError;
|
||||
|
||||
const VALID_STATUS: &[&str] = &["completed", "ongoing"];
|
||||
|
||||
// region: --- Book Types
|
||||
#[derive(Debug, Clone, Fields, FromRow, Deserialize, Serialize)]
|
||||
pub struct Book {
|
||||
pub id: i64,
|
||||
pub title: String,
|
||||
pub cover: String,
|
||||
pub cover: Option<String>,
|
||||
pub summary: String,
|
||||
pub authors: Vec<String>,
|
||||
pub genres: Vec<String>,
|
||||
pub status: String,
|
||||
pub search_rank: Option<f32>,
|
||||
pub origin_book_url: String,
|
||||
pub origin_book_id: String,
|
||||
pub origin_id: i64,
|
||||
@@ -42,6 +48,7 @@ pub struct BookStub {
|
||||
pub origin_book_url: String,
|
||||
pub origin_book_id: String,
|
||||
pub origin_id: i64,
|
||||
pub search_rank: Option<f32>,
|
||||
pub created_at: OffsetDateTime,
|
||||
pub updated_at: OffsetDateTime,
|
||||
}
|
||||
@@ -49,7 +56,7 @@ pub struct BookStub {
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize)]
|
||||
pub struct BookForCreate {
|
||||
pub title: String,
|
||||
pub summary: String,
|
||||
pub summary: String,
|
||||
pub authors: Vec<String>,
|
||||
pub genres: Vec<String>,
|
||||
pub status: String,
|
||||
@@ -61,13 +68,18 @@ pub struct BookForCreate {
|
||||
|
||||
impl Validation for BookForCreate {
|
||||
fn validate(&self) -> Result<()> {
|
||||
match &self.cover {
|
||||
Some(cover) => {
|
||||
validate_uri_type(cover.as_str(), "image")?;
|
||||
Ok(())
|
||||
},
|
||||
None => Ok(())
|
||||
if let Some(cover) = &self.cover {
|
||||
validate_uri_type(cover.as_str(), "image")?;
|
||||
}
|
||||
|
||||
if !VALID_STATUS.contains(&self.status.as_str()) {
|
||||
Err(ValidationError::InvalidBookStatus {
|
||||
status: self.status.clone(),
|
||||
valid_status: VALID_STATUS.iter().map(|s| s.to_string()).collect(),
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,9 +91,11 @@ pub struct BookFilter {
|
||||
pub genres: Option<OpValsString>,
|
||||
pub status: Option<OpValsString>,
|
||||
pub origin_id: Option<OpValsInt64>,
|
||||
pub origin_book_id: Option<OpValsString>,
|
||||
pub search: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize, Validation)]
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize, PartialEq)]
|
||||
pub struct BookForUpdate {
|
||||
pub title: Option<String>,
|
||||
pub summary: Option<String>,
|
||||
@@ -89,11 +103,49 @@ pub struct BookForUpdate {
|
||||
pub genres: Option<Vec<String>>,
|
||||
pub status: Option<String>,
|
||||
pub cover: Option<String>,
|
||||
pub origin_book_url: Option<String>,
|
||||
pub origin_book_id: Option<String>,
|
||||
pub origin_book_url: Option<String>,
|
||||
pub origin_book_id: Option<String>,
|
||||
pub origin_id: Option<i64>,
|
||||
}
|
||||
|
||||
/// Turns a creation payload into an update payload in which every field is set.
///
/// Implemented as `From` instead of a hand-written `Into`: the standard library's
/// blanket impl still provides `BookForCreate: Into<BookForUpdate>`, so existing
/// `.into()` call sites keep compiling, and `BookForUpdate::from(..)` becomes
/// available as well.
impl From<BookForCreate> for BookForUpdate {
    fn from(book: BookForCreate) -> Self {
        Self {
            title: Some(book.title),
            summary: Some(book.summary),
            authors: Some(book.authors),
            genres: Some(book.genres),
            status: Some(book.status),
            // `cover` is already optional on the create payload, so it is forwarded unchanged.
            cover: book.cover,
            origin_book_url: Some(book.origin_book_url),
            origin_book_id: Some(book.origin_book_id),
            origin_id: Some(book.origin_id),
        }
    }
}
|
||||
|
||||
impl Validation for BookForUpdate {
|
||||
fn validate(&self) -> Result<()> {
|
||||
if let Some(cover) = &self.cover {
|
||||
validate_uri_type(cover.as_str(), "image")?;
|
||||
}
|
||||
|
||||
if let Some(status) = &self.status {
|
||||
if !VALID_STATUS.contains(&status.as_str()) {
|
||||
Err(ValidationError::InvalidBookStatus {
|
||||
status: status.clone(),
|
||||
valid_status: VALID_STATUS
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect(),
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// impl Validation for BookForUpdate {}
|
||||
|
||||
// endregion: --- Book Types
|
||||
@@ -124,7 +176,25 @@ impl BookBmc {
|
||||
filter: Option<Vec<BookFilter>>,
|
||||
list_options: Option<ListOptions>,
|
||||
) -> Result<Vec<Book>> {
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options).await
|
||||
let mut expressions = vec![];
|
||||
|
||||
if let Some(search) = filter.clone().and_then(|filters| {
|
||||
filters
|
||||
.into_iter()
|
||||
.filter(|f| f.search.is_some())
|
||||
.next()
|
||||
.and_then(|f| f.search)
|
||||
}) {
|
||||
expressions.push(Expr::cust_with_values(
|
||||
"ts_rank(searchable, websearch_to_tsquery($1)) as search_rank",
|
||||
vec![search],
|
||||
));
|
||||
} else {
|
||||
expressions.push(Expr::cust(
|
||||
"null as search_rank",
|
||||
));
|
||||
}
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options, expressions).await
|
||||
}
|
||||
|
||||
pub async fn list_stubs(
|
||||
@@ -133,7 +203,7 @@ impl BookBmc {
|
||||
filter: Option<Vec<BookFilter>>,
|
||||
list_options: Option<ListOptions>,
|
||||
) -> Result<Vec<BookStub>> {
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options).await
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options, vec![]).await
|
||||
}
|
||||
|
||||
pub async fn update(
|
||||
|
||||
@@ -22,8 +22,8 @@ pub struct Chapter {
|
||||
pub book_id: i64,
|
||||
pub volume: i64,
|
||||
pub number: i64,
|
||||
pub title: Option<String>,
|
||||
pub content: Option<String>,
|
||||
pub title: String,
|
||||
pub content: String,
|
||||
pub origin_chapter_url: String,
|
||||
pub origin_chapter_id: String,
|
||||
pub created_at: OffsetDateTime,
|
||||
@@ -43,9 +43,18 @@ pub struct ChapterStub {
|
||||
pub updated_at: OffsetDateTime,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize)]
|
||||
pub struct ChapterStubForCreate {
|
||||
pub book_id: i64,
|
||||
pub volume: i64,
|
||||
pub number: i64,
|
||||
pub title: String,
|
||||
pub origin_chapter_url: String,
|
||||
pub origin_chapter_id: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize, Validation)]
|
||||
pub struct ChapterForCreate {
|
||||
pub id: i64,
|
||||
pub book_id: i64,
|
||||
pub volume: i64,
|
||||
pub number: i64,
|
||||
@@ -58,7 +67,7 @@ pub struct ChapterForCreate {
|
||||
#[derive(Debug, Clone, Deserialize, FilterNodes, Default)]
|
||||
pub struct ChapterFilter {
|
||||
pub id: Option<OpValsInt64>,
|
||||
pub bookd_id: Option<OpValsInt64>,
|
||||
pub book_id: Option<OpValsInt64>,
|
||||
pub volume: Option<OpValsInt64>,
|
||||
pub number: Option<OpValsInt64>,
|
||||
pub title: Option<OpValsString>,
|
||||
@@ -66,13 +75,21 @@ pub struct ChapterFilter {
|
||||
pub origin_chapter_url: Option<OpValsString>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize, Validation)]
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize, Validation, PartialEq)]
|
||||
pub struct ChapterForUpdate {
|
||||
pub id: i64,
|
||||
pub title: Option<String>,
|
||||
pub content: Option<String>,
|
||||
}
|
||||
|
||||
/// Turns a chapter creation payload into an update payload carrying its title and content.
///
/// Implemented as `From` instead of a hand-written `Into`: the blanket impl still
/// provides `ChapterForCreate: Into<ChapterForUpdate>`, so existing `.into()` call
/// sites keep compiling.
impl From<ChapterForCreate> for ChapterForUpdate {
    fn from(chapter: ChapterForCreate) -> Self {
        Self {
            title: Some(chapter.title),
            content: Some(chapter.content),
        }
    }
}
|
||||
|
||||
// endregion: --- Chapter Types
|
||||
|
||||
// region: --- ChapterBmc
|
||||
@@ -101,7 +118,7 @@ impl ChapterBmc {
|
||||
filter: Option<Vec<ChapterFilter>>,
|
||||
list_options: Option<ListOptions>,
|
||||
) -> Result<Vec<Chapter>> {
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options).await
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options, vec![]).await
|
||||
}
|
||||
|
||||
pub async fn list_stubs(
|
||||
@@ -110,7 +127,7 @@ impl ChapterBmc {
|
||||
filter: Option<Vec<ChapterFilter>>,
|
||||
list_options: Option<ListOptions>,
|
||||
) -> Result<Vec<ChapterStub>> {
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options).await
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options, vec![]).await
|
||||
}
|
||||
|
||||
pub async fn update(
|
||||
|
||||
@@ -18,9 +18,11 @@ pub enum Error {
|
||||
max: i64,
|
||||
actual: i64,
|
||||
},
|
||||
// Validation errors
|
||||
#[from]
|
||||
ValidationError(ValidationError),
|
||||
|
||||
|
||||
// -- Modules
|
||||
#[from]
|
||||
Pwd(pwd::Error),
|
||||
|
||||
@@ -38,13 +38,22 @@ pub struct OriginFilter {
|
||||
pub url: Option<OpValsString>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize, Validation)]
|
||||
#[derive(Debug, Clone, Fields, Deserialize, Serialize, Validation, PartialEq)]
|
||||
pub struct OriginForUpdate {
|
||||
pub id: i64,
|
||||
// pub id: i64,
|
||||
pub name: Option<String>,
|
||||
pub url: Option<String>,
|
||||
}
|
||||
|
||||
/// Turns an origin creation payload into an update payload carrying its name and url.
///
/// Implemented as `From` instead of a hand-written `Into`: the blanket impl still
/// provides `OriginForCreate: Into<OriginForUpdate>`, so existing `.into()` call
/// sites keep compiling.
impl From<OriginForCreate> for OriginForUpdate {
    fn from(origin: OriginForCreate) -> Self {
        Self {
            name: Some(origin.name),
            url: Some(origin.url),
        }
    }
}
|
||||
|
||||
// endregion: --- Origin Types
|
||||
|
||||
// region: --- OriginBmc
|
||||
@@ -73,7 +82,7 @@ impl OriginBmc {
|
||||
filter: Option<Vec<OriginFilter>>,
|
||||
list_options: Option<ListOptions>,
|
||||
) -> Result<Vec<Origin>> {
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options).await
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options, vec![]).await
|
||||
}
|
||||
|
||||
pub async fn update(
|
||||
|
||||
@@ -88,7 +88,7 @@ impl TaskBmc {
|
||||
filter: Option<Vec<TaskFilter>>,
|
||||
list_options: Option<ListOptions>,
|
||||
) -> Result<Vec<Task>> {
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options).await
|
||||
base::list::<Self, _, _>(ctx, mm, filter, list_options, vec![]).await
|
||||
}
|
||||
|
||||
pub async fn update(
|
||||
|
||||
@@ -16,6 +16,10 @@ pub enum ValidationError {
|
||||
url: String,
|
||||
data_url_error: String,
|
||||
},
|
||||
InvalidBookStatus {
|
||||
status: String,
|
||||
valid_status: Vec<String>,
|
||||
}
|
||||
}
|
||||
|
||||
// region: --- Error Boilerplate
|
||||
|
||||
@@ -12,9 +12,14 @@ workspace = true
|
||||
[dependencies]
|
||||
# -- App Libs
|
||||
lib-core = { path = "../../libs/lib-core"}
|
||||
lib-utils = { path = "../../libs/lib-utils"}
|
||||
|
||||
# -- External Libs
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
serde_json.workspace = true
|
||||
derive_more.workspace = true
|
||||
data-url.workspace = true
|
||||
rayon.workspace = true
|
||||
url = { version = "2.5.2", features = ["serde"] }
|
||||
reqwest = { version = "0.12.8", features = ["blocking", "json"] }
|
||||
|
||||
395
crates/libs/lib-scraper/src/api.rs
Normal file
395
crates/libs/lib-scraper/src/api.rs
Normal file
@@ -0,0 +1,395 @@
|
||||
use lib_core::model::{
|
||||
book::{Book, BookForCreate, BookForUpdate},
|
||||
chapter::{Chapter, ChapterForCreate, ChapterForUpdate, ChapterStub},
|
||||
origin::{Origin, OriginForCreate, OriginForUpdate},
|
||||
};
|
||||
use lib_utils::rpc_objects::RpcResponse;
|
||||
|
||||
use crate::{Api, Result};
|
||||
use serde_json::{json, to_string_pretty};
|
||||
use url::Url;
|
||||
|
||||
pub struct RpcApi {
|
||||
url: Url,
|
||||
client: reqwest::blocking::Client,
|
||||
}
|
||||
|
||||
impl RpcApi {
|
||||
pub fn new(url: Url) -> Self {
|
||||
Self {
|
||||
url,
|
||||
client: reqwest::blocking::Client::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_origin(&self, origin: &OriginForCreate) -> Result<Origin> {
|
||||
let req = json!({
|
||||
"method": "create_origin",
|
||||
"id": None::<String>,
|
||||
"params": {
|
||||
"data": origin,
|
||||
},
|
||||
});
|
||||
|
||||
let response = self.client.post(self.url.clone()).json(&req).send()?;
|
||||
|
||||
let res: RpcResponse<Origin> = response.json()?;
|
||||
|
||||
if res.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&res.error.unwrap()).unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(res.clone().result.ok_or_else(|| crate::Error::ApiError {
|
||||
msg: "Invalid Object".to_string(),
|
||||
details: to_string_pretty(&res.clone()).unwrap(),
|
||||
})?)
|
||||
}
|
||||
|
||||
pub fn update_origin(&self, origin: &OriginForUpdate, origin_id: i64) -> Result<Origin> {
|
||||
let req = json!({
|
||||
"method": "update_origin",
|
||||
"id": None::<bool>,
|
||||
"params": {
|
||||
"id": origin_id,
|
||||
"data": origin,
|
||||
},
|
||||
});
|
||||
|
||||
let response = self.client.post(self.url.clone()).json(&req).send()?;
|
||||
|
||||
let res: RpcResponse<Origin> = response.json()?;
|
||||
|
||||
if res.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&res.error.unwrap()).unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(res.clone().result.ok_or_else(|| crate::Error::ApiError {
|
||||
msg: "Invalid Object".to_string(),
|
||||
details: to_string_pretty(&res).unwrap(),
|
||||
})?)
|
||||
}
|
||||
|
||||
pub fn create_book(&self, book: &BookForCreate) -> Result<Book> {
|
||||
let req = json!({
|
||||
"method": "create_book",
|
||||
"id": None::<String>,
|
||||
"params": {
|
||||
"data": book,
|
||||
},
|
||||
});
|
||||
|
||||
let response = self.client.post(self.url.clone()).json(&req).send()?;
|
||||
|
||||
let res: RpcResponse<Book> = response.json()?;
|
||||
|
||||
if res.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&res.error.unwrap()).unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(res.clone().result.ok_or_else(|| crate::Error::ApiError {
|
||||
msg: "Invalid Object".to_string(),
|
||||
details: to_string_pretty(&res).unwrap(),
|
||||
})?)
|
||||
}
|
||||
|
||||
pub fn update_book(&self, book: &BookForUpdate, id: i64) -> Result<Book> {
|
||||
let req = json!({
|
||||
"method": "update_book",
|
||||
"id": None::<String>,
|
||||
"params": {
|
||||
"id": id,
|
||||
"data": book,
|
||||
},
|
||||
});
|
||||
|
||||
let response = self.client.post(self.url.clone()).json(&req).send()?;
|
||||
|
||||
let res: RpcResponse<Book> = response.json()?;
|
||||
|
||||
if res.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&res.error.unwrap()).unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(res.clone().result.ok_or_else(|| crate::Error::ApiError {
|
||||
msg: "Invalid Object".to_string(),
|
||||
details: to_string_pretty(&res).unwrap(),
|
||||
})?)
|
||||
}
|
||||
|
||||
pub fn create_chapter(&self, chapter: &ChapterForCreate) -> Result<Chapter> {
|
||||
let req = json!({
|
||||
"method": "create_chapter",
|
||||
"id": None::<String>,
|
||||
"params": {
|
||||
"data": chapter,
|
||||
},
|
||||
});
|
||||
|
||||
let response = self.client.post(self.url.clone()).json(&req).send()?;
|
||||
|
||||
let res: RpcResponse<Chapter> = response.json()?;
|
||||
|
||||
if res.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&res.error.unwrap()).unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(res.clone().result.ok_or_else(|| crate::Error::ApiError {
|
||||
msg: "Invalid Object".to_string(),
|
||||
details: to_string_pretty(&res).unwrap(),
|
||||
})?)
|
||||
}
|
||||
|
||||
pub fn update_chapter(&self, chapter: &ChapterForUpdate, id: i64) -> Result<Chapter> {
|
||||
let req = json!({
|
||||
"method": "update_chapter",
|
||||
"id": None::<String>,
|
||||
"params": {
|
||||
"id": id,
|
||||
"data": chapter,
|
||||
},
|
||||
});
|
||||
|
||||
let response = self.client.post(self.url.clone()).json(&req).send()?;
|
||||
|
||||
let res: RpcResponse<Chapter> = response.json()?;
|
||||
|
||||
if res.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&res.error.unwrap()).unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(res.clone().result.ok_or_else(|| crate::Error::ApiError {
|
||||
msg: "Invalid Object".to_string(),
|
||||
details: to_string_pretty(&res).unwrap(),
|
||||
})?)
|
||||
}
|
||||
}
|
||||
|
||||
impl Api for RpcApi {
|
||||
fn get_chapter_stubs_for_book(&self, book: &Book) -> Result<Vec<ChapterStub>> {
|
||||
let req = json!({
|
||||
"method": "list_chapter_stubs",
|
||||
"id": None::<bool>,
|
||||
"params": {
|
||||
"filters": {
|
||||
"book_id": book.id,
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
let response = self.client.post(self.url.clone()).json(&req).send()?;
|
||||
|
||||
let res: RpcResponse<Vec<lib_core::model::chapter::ChapterStub>> =
|
||||
response.json()?;
|
||||
|
||||
if res.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&res.error.unwrap()).unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
let res_clone = res.clone();
|
||||
Ok(res.result.ok_or_else(|| crate::Error::ApiError {
|
||||
msg: "Invalid Object".to_string(),
|
||||
details: to_string_pretty(&res_clone).unwrap(),
|
||||
})?)
|
||||
}
|
||||
|
||||
fn upsert_origin(&self, origin: &OriginForCreate) -> Result<Origin> {
|
||||
let get_origin_request_body = json!({
|
||||
"method": "list_origins",
|
||||
"id": None::<String>,
|
||||
"params": {
|
||||
"filters": {
|
||||
"name": origin.name,
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
let get_origin_response = self
|
||||
.client
|
||||
.post(self.url.clone())
|
||||
.json(&get_origin_request_body)
|
||||
.send()?;
|
||||
|
||||
let get_origin_response_data: RpcResponse<Vec<Origin>> =
|
||||
get_origin_response.json()?;
|
||||
|
||||
if get_origin_response_data.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&get_origin_response_data.error.unwrap())
|
||||
.unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(origins) = get_origin_response_data.clone().result {
|
||||
match &origins.len() {
|
||||
0 => {
|
||||
return self.create_origin(origin);
|
||||
}
|
||||
1 => {
|
||||
let old_origin_update = OriginForUpdate {
|
||||
name: Some(origins[0].name.clone()),
|
||||
url: Some(origins[0].url.clone()),
|
||||
};
|
||||
let new_origin_update = origin.clone().into();
|
||||
|
||||
if new_origin_update != old_origin_update {
|
||||
return self.update_origin(&new_origin_update, origins[0].id);
|
||||
}
|
||||
Ok(origins[0].clone())
|
||||
}
|
||||
_ => Err(crate::Error::ApiError {
|
||||
msg: "Multiple origins found".to_string(),
|
||||
details: to_string_pretty(&get_origin_response_data).unwrap(),
|
||||
}),
|
||||
}
|
||||
} else {
|
||||
Err(crate::Error::ApiError {
|
||||
msg: "No result in response".to_string(),
|
||||
details: to_string_pretty(&get_origin_response_data).unwrap(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn upsert_book(&self, book: &BookForCreate) -> Result<Book> {
|
||||
println!("Upserting book: {}, {}", book.title, book.origin_book_url);
|
||||
let get_book_request_body = json!({
|
||||
"method": "list_books",
|
||||
"id": None::<String>,
|
||||
"params": {
|
||||
"filters": {
|
||||
"origin_id": book.origin_id,
|
||||
"origin_book_id": book.origin_book_id,
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
let get_book_response = self
|
||||
.client
|
||||
.post(self.url.clone())
|
||||
.json(&get_book_request_body)
|
||||
.send()?;
|
||||
|
||||
let get_book_response_data: RpcResponse<Vec<Book>> =
|
||||
get_book_response.json()?;
|
||||
|
||||
if get_book_response_data.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&get_book_response_data.error.unwrap())
|
||||
.unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(books) = get_book_response_data.clone().result {
|
||||
match &books.len() {
|
||||
0 => self.create_book(book),
|
||||
1 => {
|
||||
let old_book_update = BookForUpdate {
|
||||
title: Some(books[0].title.clone()),
|
||||
summary: Some(books[0].summary.clone()),
|
||||
authors: Some(books[0].authors.clone()),
|
||||
genres: Some(books[0].genres.clone()),
|
||||
status: Some(books[0].status.clone()),
|
||||
cover: books[0].cover.clone(),
|
||||
origin_book_url: Some(books[0].origin_book_url.clone()),
|
||||
origin_book_id: Some(books[0].origin_book_id.clone()),
|
||||
origin_id: Some(books[0].origin_id),
|
||||
};
|
||||
let new_book_update = book.clone().into();
|
||||
|
||||
if new_book_update != old_book_update {
|
||||
return self.update_book(&new_book_update, books[0].id);
|
||||
}
|
||||
Ok(books[0].clone())
|
||||
}
|
||||
_ => Err(crate::Error::ApiError {
|
||||
msg: "Multiple books found".to_string(),
|
||||
details: to_string_pretty(&get_book_response_data).unwrap(),
|
||||
}),
|
||||
}
|
||||
} else {
|
||||
Err(crate::Error::ApiError {
|
||||
msg: "No result in response".to_string(),
|
||||
details: to_string_pretty(&get_book_response_data).unwrap(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn upsert_chapter(&self, chapter: &ChapterForCreate) -> Result<Chapter> {
|
||||
let get_chapter_request_body = json!({
|
||||
"method": "list_chapters",
|
||||
"id": None::<String>,
|
||||
"params": {
|
||||
"filters": {
|
||||
"book_id": chapter.book_id,
|
||||
"number": chapter.number,
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
let get_chapter_response = self
|
||||
.client
|
||||
.post(self.url.clone())
|
||||
.json(&get_chapter_request_body)
|
||||
.send()?;
|
||||
|
||||
let get_chapter_response_data: RpcResponse<Vec<Chapter>> =
|
||||
get_chapter_response.json()?;
|
||||
|
||||
if get_chapter_response_data.error.is_some() {
|
||||
return Err(crate::Error::ApiError {
|
||||
msg: "Error in response".to_string(),
|
||||
details: to_string_pretty(&get_chapter_response_data.error.unwrap())
|
||||
.unwrap(),
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(chapters) = get_chapter_response_data.clone().result {
|
||||
match &chapters.len() {
|
||||
0 => self.create_chapter(chapter),
|
||||
1 => {
|
||||
let old_chapter_update = ChapterForUpdate {
|
||||
title: Some(chapters[0].title.clone()),
|
||||
content: Some(chapters[0].content.clone()),
|
||||
};
|
||||
let new_chapter_update = chapter.clone().into();
|
||||
|
||||
if new_chapter_update != old_chapter_update {
|
||||
return self.update_chapter(&new_chapter_update, chapters[0].id);
|
||||
}
|
||||
Ok(chapters[0].clone())
|
||||
}
|
||||
_ => Err(crate::Error::ApiError {
|
||||
msg: "Multiple chapters found".to_string(),
|
||||
details: to_string_pretty(&get_chapter_response_data).unwrap(),
|
||||
}),
|
||||
}
|
||||
} else {
|
||||
Err(crate::Error::ApiError {
|
||||
msg: "No result in response".to_string(),
|
||||
details: to_string_pretty(&get_chapter_response_data).unwrap(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,12 +1,32 @@
|
||||
use derive_more::From;
|
||||
use serde::Serialize;
|
||||
use serde_with::serde_as;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
pub type Result<T> = core::result::Result<T, Error>;
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, From)]
|
||||
pub enum Error {
|
||||
#[from]
|
||||
ReqwestError(#[serde_as( as = "DisplayFromStr" )] reqwest::Error),
|
||||
ApiError{msg: String, details: String},
|
||||
InvalidDocument {
|
||||
msg: String,
|
||||
url: String,
|
||||
},
|
||||
InvalidElement {
|
||||
msg: String,
|
||||
url: String,
|
||||
},
|
||||
InvalidChapterTitle {
|
||||
msg: String,
|
||||
title: String,
|
||||
url: String,
|
||||
},
|
||||
OriginIdNotSet,
|
||||
#[from]
|
||||
MimeParsingError(#[serde_as( as = "DisplayFromStr")] data_url::mime::MimeParsingError)
|
||||
|
||||
// EntityNotFound {
|
||||
// entity: &'static str,
|
||||
// id: i64,
|
||||
|
||||
@@ -11,22 +11,24 @@ pub mod api;
|
||||
// re exports
|
||||
pub use error::{Error, Result};
|
||||
|
||||
use lib_core::model::book::BookForCreate;
|
||||
use lib_core::model::chapter::{ChapterForCreate, ChapterStub};
|
||||
use lib_core::model::origin::OriginForCreate;
|
||||
pub use lib_core::model::book::{BookForCreate, Book};
|
||||
pub use lib_core::model::chapter::{ChapterForCreate, ChapterStub, Chapter, ChapterStubForCreate};
|
||||
pub use lib_core::model::origin::{OriginForCreate, Origin};
|
||||
|
||||
pub trait Api {
|
||||
fn upsert_book(&self, book: &BookForCreate) -> Result<i64>;
|
||||
fn get_chapter_stubs_for_book(&self, book_id: &i64) -> Result<Vec<ChapterStub>>;
|
||||
fn upsert_chapter(&self, chapter: &ChapterForCreate) -> Result<i64>;
|
||||
fn upsert_origin(&self, origin: &OriginForCreate) -> Result<Origin>;
|
||||
fn upsert_book(&self, book: &BookForCreate) -> Result<Book>;
|
||||
fn get_chapter_stubs_for_book(&self, book_id: &Book) -> Result<Vec<ChapterStub>>;
|
||||
fn upsert_chapter(&self, chapter: &ChapterForCreate) -> Result<Chapter>;
|
||||
}
|
||||
|
||||
pub trait Source {
|
||||
fn origin(&self) -> OriginForCreate;
|
||||
fn set_origin_id(&mut self, origin_id: &i64);
|
||||
fn get_all_books(&self) -> Result<Vec<BookForCreate>>;
|
||||
fn get_book_chapter_stubs(
|
||||
fn get_book_chapters_stubs(
|
||||
&self,
|
||||
book: &BookForCreate,
|
||||
) -> Result<Vec<ChapterStub>>;
|
||||
fn get_full_chapter(&self, chapter: &ChapterStub) -> Result<ChapterForCreate>;
|
||||
book: &Book,
|
||||
) -> Result<Vec<ChapterStubForCreate>>;
|
||||
fn get_full_chapters(&self, stubs: Vec<ChapterStubForCreate>) -> Result<Vec<ChapterForCreate>>;
|
||||
}
|
||||
|
||||
@@ -1,35 +1,43 @@
|
||||
use crate::{Api, Result, Source};
|
||||
use lib_core::model::chapter::ChapterStub;
|
||||
use rayon::prelude::*;
|
||||
|
||||
pub struct Scraper<S: Source, A: Api> {
|
||||
pub struct Scraper<S: Source + Clone, A: Api + Sync + Send> {
|
||||
source: S,
|
||||
api: A,
|
||||
}
|
||||
|
||||
impl<T: Source, S: Api> Scraper<T, S> {
|
||||
pub fn new(source: T, api: S) -> Self {
|
||||
impl<S: Source + Clone, A: Api + Sync + Send> Scraper<S, A> {
|
||||
pub fn new(api: A, source: S) -> Self {
|
||||
Self { source, api }
|
||||
}
|
||||
|
||||
pub fn update(&self) -> Result<()> {
|
||||
pub fn origin_id(&self) -> Result<i64> {
|
||||
let origin = self.source.origin();
|
||||
Ok(self.api.upsert_origin(&origin)?.id)
|
||||
}
|
||||
|
||||
pub fn update(&mut self) -> Result<()> {
|
||||
self.source.set_origin_id(&self.origin_id()?);
|
||||
|
||||
let src_books = self.source.get_all_books()?;
|
||||
|
||||
for book in src_books {
|
||||
let book_id = self.api.upsert_book(&book)?;
|
||||
let chapter_stubs = self.api.get_chapter_stubs_for_book(&book_id)?;
|
||||
for src_book in src_books {
|
||||
let book = self.api.upsert_book(&src_book)?;
|
||||
let chapter_stubs = self.api.get_chapter_stubs_for_book(&book)?;
|
||||
let chapter_numbers =
|
||||
chapter_stubs.iter().map(|c| c.number).collect::<Vec<i64>>();
|
||||
let src_chapter_stubs = self.source.get_book_chapter_stubs(&book)?;
|
||||
let src_chapter_stubs = self.source.get_book_chapters_stubs(&book)?;
|
||||
|
||||
let missing_chapters = src_chapter_stubs
|
||||
.into_iter()
|
||||
.filter(|e| !chapter_numbers.contains(&e.number))
|
||||
.collect::<Vec<ChapterStub>>();
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for chapter in missing_chapters {
|
||||
let full_chapter = self.source.get_full_chapter(&chapter)?;
|
||||
self.api.upsert_chapter(&full_chapter)?;
|
||||
}
|
||||
let full_chapters = self.source.get_full_chapters(missing_chapters)?;
|
||||
full_chapters
|
||||
.par_iter()
|
||||
.map(|f| self.api.upsert_chapter(f))
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::result;
|
||||
|
||||
use serde::{
|
||||
de::{self, DeserializeOwned, Visitor},
|
||||
ser::SerializeStruct,
|
||||
|
||||
41
crates/services/scrapers/boxnovel/Cargo.toml
Normal file
41
crates/services/scrapers/boxnovel/Cargo.toml
Normal file
@@ -0,0 +1,41 @@
|
||||
[package]
|
||||
name = "scraper-boxnovel"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
# -- App Libs
|
||||
lib-core = { path = "../../../libs/lib-core"}
|
||||
lib-scraper = { path = "../../../libs/lib-scraper"}
|
||||
lib-utils = { path = "../../../libs/lib-utils"}
|
||||
# -- Async
|
||||
# tokio = { version = "1", features = ["full"] }
|
||||
# async-trait = "0.1"
|
||||
# # -- Json
|
||||
# serde = { version = "1", features = ["derive"] }
|
||||
# serde_json = "1"
|
||||
# serde_with = "3"
|
||||
# # -- Web
|
||||
# axum = {version = "0.7", features = ["macros"]}
|
||||
# tower-http = { version = "0.5", features = ["fs"] }
|
||||
# tower-cookies = "0.10"
|
||||
# # -- Tracing
|
||||
# tracing = "0.1"
|
||||
# tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
# # -- Others
|
||||
# time = "0.3"
|
||||
# uuid = {version = "1", features = ["v4","fast-rng",]}
|
||||
# strum_macros = "0.25"
|
||||
# derive_more = {version = "1.0.0-beta", features = ["from"] }
|
||||
|
||||
# -- Scraping
|
||||
derive_more.workspace = true
|
||||
rayon.workspace = true
|
||||
reqwest = { version = "0.12.9", features = ["blocking"] }
|
||||
scraper = "0.20.0"
|
||||
url = "2.5.2"
|
||||
regex = "1.11.1"
|
||||
|
||||
|
||||
|
||||
[dev-dependencies]
|
||||
429
crates/services/scrapers/boxnovel/src/main.rs
Normal file
429
crates/services/scrapers/boxnovel/src/main.rs
Normal file
@@ -0,0 +1,429 @@
|
||||
use lib_core::model::chapter::ChapterStubForCreate;
|
||||
use lib_scraper::{
|
||||
Book, BookForCreate, ChapterForCreate, Error, OriginForCreate, Result, Source,
|
||||
};
|
||||
use rayon::prelude::*;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
fn main() {
|
||||
let api = lib_scraper::api::RpcApi::new(
|
||||
"http://127.0.0.1:8080/api/rpc".parse().unwrap(),
|
||||
);
|
||||
|
||||
let source = BoxnovelSource::new(
|
||||
"Boxnovel".to_string(),
|
||||
"https://boxnovel.com".to_string(),
|
||||
);
|
||||
|
||||
let mut scraper = lib_scraper::scraper::Scraper::new(api, source);
|
||||
let r = scraper.update();
|
||||
|
||||
println!("Finished: {:?}", r);
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BoxnovelSource {
|
||||
// Source info
|
||||
origin_id: Option<i64>,
|
||||
origin: String,
|
||||
url: String,
|
||||
|
||||
// scraping stuff
|
||||
client: reqwest::blocking::Client,
|
||||
}
|
||||
|
||||
/// Links extracted from a single listing page.
#[derive(Debug, Clone)]
struct BoxnovelListingPage {
    /// `href` of every book link found on the page.
    book_urls: Vec<String>,
    /// `href` of the link to the following listing page, if the page has one.
    next_url: Option<String>,
}
|
||||
|
||||
impl BoxnovelSource {
|
||||
pub fn new(origin: String, url: String) -> Self {
|
||||
let client = reqwest::blocking::Client::new();
|
||||
Self {
|
||||
origin_id: None,
|
||||
origin,
|
||||
url,
|
||||
client,
|
||||
}
|
||||
}
|
||||
|
||||
/// Walks the listing pages starting at `start_url` and returns every book URL found.
///
/// Each page is fetched, parsed with `extract_boxnovel_listing`, and its
/// `next_url` is followed until a page has none. The first request or parse
/// error aborts the walk and is returned.
fn get_book_listings(&self, start_url: String) -> Result<Vec<String>> {
    let mut book_urls = Vec::new();
    let mut pending = Some(start_url);

    while let Some(page_url) = pending.take() {
        println!("Getting url: {}", page_url);
        let body = self.client.get(page_url.as_str()).send()?.text()?;
        let page = self.extract_boxnovel_listing(body, page_url)?;

        book_urls.extend(page.book_urls);
        pending = page.next_url;
    }

    Ok(book_urls)
}
|
||||
|
||||
/// Downloads the book page at `url` and parses it into a [`BookForCreate`].
fn get_book(&self, url: &str) -> Result<BookForCreate> {
    println!("Getting book: {}", url);
    let html = self.client.get(url).send()?.text()?;
    self.extract_boxnovel_book(html, url.to_string())
}
|
||||
|
||||
fn extract_boxnovel_book(
|
||||
&self,
|
||||
html: String,
|
||||
url: String,
|
||||
) -> Result<BookForCreate> {
|
||||
let document = Html::parse_document(&html);
|
||||
|
||||
let title = document
|
||||
.select(&Selector::parse(".post-title>h1").unwrap())
|
||||
.next()
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "title not found".to_string(),
|
||||
url: url.clone(),
|
||||
})?
|
||||
.text()
|
||||
.collect::<String>()
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
let summary = document
|
||||
.select(&Selector::parse(".summary__content").unwrap())
|
||||
.next()
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "summary not found".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
.map(|e| e.text().collect::<String>())
|
||||
.unwrap_or("".to_string())
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
let authors = document
|
||||
.select(&Selector::parse(".author-content").unwrap())
|
||||
.next()
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "authors not found".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
.map(|e| {
|
||||
e.text()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
.split(",")
|
||||
.map(|s| s.trim().to_string())
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or(vec![]);
|
||||
|
||||
let genres = document
|
||||
.select(&Selector::parse(".genres-content").unwrap())
|
||||
.next()
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "genres not found".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
.map(|e| {
|
||||
e.text()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
.split(",")
|
||||
.map(|s| s.trim().to_string())
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or(vec![]);
|
||||
|
||||
let status = document
|
||||
.select(&Selector::parse(".post-status>.post-content_item").unwrap())
|
||||
.filter(|e| {
|
||||
e.child_elements()
|
||||
.nth(0)
|
||||
.map(|t| {
|
||||
t.text().collect::<Vec<_>>().join(" ").trim().to_string()
|
||||
})
|
||||
.eq(&Some("Status".to_string()))
|
||||
})
|
||||
.nth(0)
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "status item not found".to_string(),
|
||||
url: url.clone(),
|
||||
})?
|
||||
.child_elements()
|
||||
.nth(1)
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "status element not found".to_string(),
|
||||
url: url.clone(),
|
||||
})?
|
||||
.text()
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
.trim()
|
||||
.to_lowercase();
|
||||
|
||||
let cover = match document
|
||||
.select(&Selector::parse(".summary_image>a>img").unwrap())
|
||||
.next()
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "cover image not found".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
.map(|e| {
|
||||
e.attr("src").ok_or(Error::InvalidElement {
|
||||
msg: "src in img missing".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
.and_then(|i| i.ok())
|
||||
.map(|s| self.get_cover(s.to_string()))
|
||||
.transpose()
|
||||
{
|
||||
Ok(c) => c,
|
||||
Err(Error::InvalidDocument { .. }) => None,
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
let origin_book_id = self.get_book_id_from_url(url.clone())?;
|
||||
|
||||
Ok(BookForCreate {
|
||||
title,
|
||||
summary,
|
||||
authors,
|
||||
genres,
|
||||
status,
|
||||
cover,
|
||||
origin_book_url: url.clone(),
|
||||
origin_book_id,
|
||||
origin_id: self.origin_id.clone().ok_or(Error::OriginIdNotSet)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_book_id_from_url(&self, url: String) -> Result<String> {
|
||||
url.clone()
|
||||
.strip_suffix("/")
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| url.clone())
|
||||
.split("/")
|
||||
.last()
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "url not in expected format".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
.map(|s| s.to_string())
|
||||
}
|
||||
|
||||
fn get_chapter_id_from_url(&self, url: String) -> Result<String> {
|
||||
url.clone()
|
||||
.strip_suffix("/")
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| url.clone())
|
||||
.split("/")
|
||||
.last()
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "url not in expected format".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
.map(|s| s.to_string())
|
||||
}
|
||||
|
||||
fn get_cover(&self, url: String) -> Result<String> {
|
||||
let image_ending = url.split(".").last().ok_or(Error::InvalidElement {
|
||||
msg: "cover url not in expected format (coudn't extract file ending)"
|
||||
.to_string(),
|
||||
url: url.clone(),
|
||||
})?;
|
||||
|
||||
let res = self.client.get(url.clone()).send()?;
|
||||
|
||||
let cover_b64 = lib_utils::b64::b64u_encode(res.bytes()?);
|
||||
|
||||
let d_url = format!("data:image/{};base64,{}", image_ending, cover_b64);
|
||||
|
||||
Ok(d_url)
|
||||
}
|
||||
|
||||
fn extract_boxnovel_listing(
|
||||
&self,
|
||||
html: String,
|
||||
url: String,
|
||||
) -> Result<BoxnovelListingPage> {
|
||||
let document = Html::parse_document(&html);
|
||||
|
||||
let novel_link_elments = document
|
||||
.select(&Selector::parse(".post-title.font-title>h3>a").unwrap())
|
||||
.into_iter()
|
||||
.map(|e| {
|
||||
e.attr("href")
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "href in link missing".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
.map(|s| s.to_string())
|
||||
})
|
||||
.collect::<Result<Vec<String>>>()?;
|
||||
|
||||
let next_link = document
|
||||
.select(&Selector::parse(".nav-previous.float-left>a").unwrap())
|
||||
.next()
|
||||
.map(|e| {
|
||||
e.attr("href")
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "href in link missing".to_string(),
|
||||
url: url.clone(),
|
||||
})
|
||||
.map(|s| s.to_string())
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
Ok(BoxnovelListingPage {
|
||||
book_urls: novel_link_elments,
|
||||
next_url: next_link,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_chapter_stub(
|
||||
&self,
|
||||
index: usize,
|
||||
element: scraper::ElementRef,
|
||||
book: &Book,
|
||||
) -> Result<ChapterStubForCreate> {
|
||||
// let pattern = r"^(?i)(?P<blob>(?:(?:(?:Chapter))\s?(?:\d+)?(?:\s?(?:-END|-|:)*)\s))+(?P<title>.*)$";
|
||||
// let pattern = r"^(?i)(?P<blob>(?:(?:(?:Chap?ter))\s?(?:\d+)?(?:\s?(?:-END|\s*-\s*)*)\s?))+(?::|-)?\s?(?P<title>.*)$";
|
||||
// let pattern = r"^(?i)(?P<blob>(?:(?:(?:x*|v*)?(?:Chap?ter)|(?:Volume))?\s*(?:\d+)(?:\s?(?:-END|\s*-\s*)*)\s?))+(?::|-)?\s?(?P<title>.*)$";
|
||||
let pattern = r"^(?i)(?P<blob>(?:(?:(?:x*|v*)?Chap?ter|Volume|Extra)?\s*(?:\d+)?(?:\s?(?:-END|\s*-\s*)*)\s?))+(?::|-)?\s?(?P<title>.*)$";
|
||||
let re = regex::Regex::new(pattern).unwrap();
|
||||
let text = element.text().collect::<String>().trim().to_string();
|
||||
|
||||
let re_res = re.captures(&text).ok_or(Error::InvalidChapterTitle {
|
||||
msg: "chapter title not in expected format".to_string(),
|
||||
title: text.clone(),
|
||||
url: book.origin_book_url.clone(),
|
||||
})?;
|
||||
|
||||
let title = re_res
|
||||
.name("title")
|
||||
.ok_or(Error::InvalidChapterTitle {
|
||||
msg: "chapter title group missing".to_string(),
|
||||
title: text.clone(),
|
||||
url: book.origin_book_url.clone(),
|
||||
})?
|
||||
.as_str()
|
||||
.to_string();
|
||||
|
||||
let origin_chapter_url = element
|
||||
.attr("href")
|
||||
.ok_or(Error::InvalidElement {
|
||||
msg: "href in chapter element missing".to_string(),
|
||||
url: book.origin_book_url.clone(),
|
||||
})?
|
||||
.to_string();
|
||||
|
||||
let origin_chapter_id =
|
||||
self.get_chapter_id_from_url(origin_chapter_url.clone())?;
|
||||
|
||||
Ok(ChapterStubForCreate {
|
||||
book_id: book.id,
|
||||
title: title,
|
||||
volume: 0,
|
||||
number: index as i64 + 1,
|
||||
origin_chapter_url,
|
||||
origin_chapter_id,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_full_chapter(
|
||||
&self,
|
||||
stub: &ChapterStubForCreate,
|
||||
) -> Result<ChapterForCreate> {
|
||||
println!("Getting chapter: {}", stub.origin_chapter_url);
|
||||
let res = self.client.get(stub.origin_chapter_url.clone()).send()?;
|
||||
let res_content = res.text()?;
|
||||
|
||||
let chapter_content = Html::parse_document(&res_content)
|
||||
.select(&Selector::parse(".reading-content>.text-left").unwrap())
|
||||
.next()
|
||||
.ok_or(Error::InvalidDocument {
|
||||
msg: "chapter content not found".to_string(),
|
||||
url: stub.origin_chapter_url.clone(),
|
||||
})
|
||||
.map(|e| e.inner_html().trim().to_string())
|
||||
.unwrap_or("".to_string());
|
||||
|
||||
Ok(ChapterForCreate {
|
||||
book_id: stub.book_id,
|
||||
volume: stub.volume,
|
||||
number: stub.number,
|
||||
title: stub.title.clone(),
|
||||
content: chapter_content,
|
||||
origin_chapter_url: stub.origin_chapter_url.clone(),
|
||||
origin_chapter_id: stub.origin_chapter_id.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Source for BoxnovelSource {
|
||||
fn origin(&self) -> OriginForCreate {
|
||||
OriginForCreate {
|
||||
name: self.origin.clone(),
|
||||
url: self.url.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn set_origin_id(&mut self, origin_id: &i64) {
|
||||
self.origin_id = Some(origin_id.clone());
|
||||
}
|
||||
|
||||
fn get_full_chapters(
|
||||
&self,
|
||||
stubs: Vec<ChapterStubForCreate>,
|
||||
) -> Result<Vec<ChapterForCreate>> {
|
||||
stubs
|
||||
.par_iter()
|
||||
.map(|stub| self.get_full_chapter(stub))
|
||||
.collect::<Result<Vec<_>>>()
|
||||
}
|
||||
|
||||
fn get_all_books(&self) -> Result<Vec<BookForCreate>> {
|
||||
let start_url = format!("{}/page/1", self.url);
|
||||
let book_urls = self.get_book_listings(start_url)?;
|
||||
let books = book_urls
|
||||
.clone()
|
||||
// .iter()
|
||||
// .take(20)
|
||||
// .collect::<Vec<_>>()
|
||||
.par_iter()
|
||||
.map(|url| self.get_book(url))
|
||||
.collect::<Result<Vec<BookForCreate>>>()?;
|
||||
Ok(books)
|
||||
}
|
||||
|
||||
fn get_book_chapters_stubs(
|
||||
&self,
|
||||
book: &Book,
|
||||
) -> Result<Vec<ChapterStubForCreate>> {
|
||||
println!("Getting chapter stubs for book: {}", book.origin_book_url);
|
||||
let content = self
|
||||
.client
|
||||
.post(format!(
|
||||
"{}/novel/{}/ajax/chapters/",
|
||||
self.url, book.origin_book_id
|
||||
))
|
||||
.send()?
|
||||
.text()?;
|
||||
|
||||
// the replace fixes an unescpaed <A> tag in the html https://bonxoanovel.com/novel/the-s-classes-that-i-raised/
|
||||
let document = Html::parse_document(&content.replace("<A>", "<A>"));
|
||||
|
||||
let stubs = document
|
||||
.select(&Selector::parse("li>a").unwrap())
|
||||
.rev()
|
||||
.enumerate()
|
||||
.map(|(i, e)| self.parse_chapter_stub(i, e, book))
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
Ok(stubs)
|
||||
}
|
||||
}
|
||||
32
crates/services/scrapers/novelfull.com/Cargo.toml
Normal file
32
crates/services/scrapers/novelfull.com/Cargo.toml
Normal file
@@ -0,0 +1,32 @@
|
||||
[package]
|
||||
name = "scraper-boxnovel"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
# -- App Libs
|
||||
lib-core = { path = "../../../libs/lib-core"}
|
||||
lib-scraper = { path = "../../../libs/lib-scraper"}
|
||||
# -- Async
|
||||
# tokio = { version = "1", features = ["full"] }
|
||||
# async-trait = "0.1"
|
||||
# # -- Json
|
||||
# serde = { version = "1", features = ["derive"] }
|
||||
# serde_json = "1"
|
||||
# serde_with = "3"
|
||||
# # -- Web
|
||||
# axum = {version = "0.7", features = ["macros"]}
|
||||
# tower-http = { version = "0.5", features = ["fs"] }
|
||||
# tower-cookies = "0.10"
|
||||
# # -- Tracing
|
||||
# tracing = "0.1"
|
||||
# tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
# # -- Others
|
||||
# time = "0.3"
|
||||
# uuid = {version = "1", features = ["v4","fast-rng",]}
|
||||
# strum_macros = "0.25"
|
||||
# derive_more = {version = "1.0.0-beta", features = ["from"] }
|
||||
|
||||
|
||||
|
||||
[dev-dependencies]
|
||||
65
crates/services/scrapers/novelfull.com/src/main.rs
Normal file
65
crates/services/scrapers/novelfull.com/src/main.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
use lib_scraper::{Api, BookForCreate, OriginForCreate, Source};
|
||||
|
||||
fn main() {
|
||||
let api =
|
||||
lib_scraper::api::RpcApi::new("http://127.0.0.1:8080/api/rpc".parse().unwrap());
|
||||
|
||||
// let test_book = BookForCreate {
|
||||
// title: "Test Book".to_string(),
|
||||
// summary: "Test Summary".to_string(),
|
||||
// authors: vec!["Test Author".to_string()],
|
||||
// genres: vec!["Test Genre".to_string()],
|
||||
// status: "Ongoing".to_string(),
|
||||
// cover: None,
|
||||
// origin_id: 1,
|
||||
// origin_book_id: "test".to_string(),
|
||||
// origin_book_url: "https://boxnovel.com/novel/test".to_string(),
|
||||
// };
|
||||
|
||||
// api.get_chapter_stubs_for_book(&1).unwrap();
|
||||
// api.upsert_book(&test_book).unwrap();
|
||||
|
||||
let source = BoxnovelSource {
|
||||
origin: "Boxnovel".to_string(),
|
||||
url: "https://boxnovel.com".to_string(),
|
||||
};
|
||||
|
||||
let scraper = lib_scraper::scraper::Scraper::new(api, source);
|
||||
|
||||
scraper.update().unwrap();
|
||||
println!("Hello, world!");
|
||||
}
|
||||
|
||||
struct BoxnovelSource {
|
||||
origin: String,
|
||||
url: String,
|
||||
}
|
||||
|
||||
impl Source for BoxnovelSource {
|
||||
fn origin(&self) -> OriginForCreate {
|
||||
OriginForCreate {
|
||||
name: self.origin.clone(),
|
||||
url: self.url.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_full_chapter(
|
||||
&self,
|
||||
stub: &lib_core::model::chapter::ChapterStub,
|
||||
) -> lib_scraper::Result<lib_core::model::chapter::ChapterForCreate> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn get_all_books(
|
||||
&self,
|
||||
) -> lib_scraper::Result<Vec<lib_core::model::book::BookForCreate>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn get_book_chapter_stubs(
|
||||
&self,
|
||||
book: &lib_core::model::book::BookForCreate,
|
||||
) -> lib_scraper::Result<Vec<lib_core::model::chapter::ChapterStub>> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
[package]
|
||||
name = "web-server"
|
||||
name = "web-api"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
@@ -9,7 +9,7 @@ use serde::Serialize;
|
||||
use serde_json::{json, Value};
|
||||
use serde_with::skip_serializing_none;
|
||||
use time::Duration;
|
||||
use tracing::debug;
|
||||
use tracing::{debug, info};
|
||||
|
||||
pub async fn log_request(
|
||||
http_method: Method,
|
||||
@@ -54,7 +54,7 @@ pub async fn log_request(
|
||||
error_data,
|
||||
};
|
||||
|
||||
debug!("REQUEST LOG LINE:\n{}", json!(log_line));
|
||||
info!("REQUEST LOG LINE:\n{}", json!(log_line));
|
||||
|
||||
// TODO - Send to cloud-watch.
|
||||
|
||||
@@ -3,7 +3,7 @@ use axum::http::StatusCode;
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use derive_more::From;
|
||||
use lib_auth::{pwd, token};
|
||||
use lib_core::model;
|
||||
use lib_core::model::{self, Validation};
|
||||
use serde::Serialize;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::sync::Arc;
|
||||
@@ -94,19 +94,38 @@ impl Error {
|
||||
CtxExt(_) => (StatusCode::FORBIDDEN, ClientError::NO_AUTH),
|
||||
|
||||
// -- Model
|
||||
Model(model::Error::ValidationError(v)) => {
|
||||
(
|
||||
StatusCode::BAD_REQUEST,
|
||||
ClientError::INVALID_FORMAT(v.to_string()),
|
||||
)
|
||||
}
|
||||
Model(model::Error::EntityNotFound { entity, id }) => (
|
||||
StatusCode::BAD_REQUEST,
|
||||
ClientError::ENTITY_NOT_FOUND { entity, id: *id },
|
||||
),
|
||||
|
||||
// Rpc Error
|
||||
Rpc(lib_rpc::Error::SerdeJson(e)) => {
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
ClientError::InvalidFormat(e.to_string()),
|
||||
ClientError::INVALID_FORMAT(e.to_string()),
|
||||
)
|
||||
},
|
||||
|
||||
Rpc(lib_rpc::Error::Model(lib_core::model::Error::ValidationError(v))) => {
|
||||
(
|
||||
StatusCode::BAD_REQUEST,
|
||||
ClientError::INVALID_FORMAT(v.to_string()),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
// -- Fallback.
|
||||
// ReqStampNotInResponseExt | Pwd(_) | Token(_) | SerdeJson(_) | Rpc(_) => (
|
||||
// StatusCode::INTERNAL_SERVER_ERROR,
|
||||
// ClientError::SERVICE_ERROR,
|
||||
// ),
|
||||
_ => (
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
ClientError::SERVICE_ERROR,
|
||||
@@ -121,9 +140,10 @@ impl Error {
|
||||
pub enum ClientError {
|
||||
LOGIN_FAIL,
|
||||
NO_AUTH,
|
||||
InvalidFormat(String),
|
||||
INVALID_FORMAT(String),
|
||||
ENTITY_NOT_FOUND { entity: &'static str, id: i64 },
|
||||
|
||||
SERVICE_ERROR,
|
||||
TEST,
|
||||
}
|
||||
// endregion: --- Client Error
|
||||
4
migrations/20241025175122_init.down.sql
Normal file
4
migrations/20241025175122_init.down.sql
Normal file
@@ -0,0 +1,4 @@
|
||||
-- Add down migration script here
|
||||
DROP TABLE chapters;
|
||||
DROP TABLE books;
|
||||
DROP TABLE origins;
|
||||
41
migrations/20241025175122_init.up.sql
Normal file
41
migrations/20241025175122_init.up.sql
Normal file
@@ -0,0 +1,41 @@
|
||||
-- Add up migration script here
|
||||
CREATE TABLE origins (
|
||||
id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
url TEXT NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
UNIQUE (name)
|
||||
);
|
||||
|
||||
CREATE TABLE books (
|
||||
id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
|
||||
title TEXT NOT NULL,
|
||||
summary TEXT NOT NULL,
|
||||
authors TEXT[] NOT NULL,
|
||||
genres TEXT[] NOT NULL,
|
||||
status TEXT NOT NULL,
|
||||
cover TEXT,
|
||||
origin_book_url TEXT NOT NULL,
|
||||
origin_book_id TEXT NOT NULL,
|
||||
origin_id BIGINT NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
FOREIGN KEY (origin_id) REFERENCES origins(id) ON DELETE CASCADE,
|
||||
UNIQUE (origin_id, origin_book_id)
|
||||
);
|
||||
|
||||
CREATE TABLE chapters (
|
||||
id BIGINT GENERATED BY DEFAULT AS IDENTITY PRIMARY KEY,
|
||||
book_id BIGINT NOT NULL,
|
||||
volume BIGINT NOT NULL,
|
||||
number BIGINT NOT NULL,
|
||||
title TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
origin_chapter_url TEXT NOT NULL,
|
||||
origin_chapter_id TEXT NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
FOREIGN KEY (book_id) REFERENCES books(id) ON DELETE CASCADE,
|
||||
UNIQUE (book_id, volume, number)
|
||||
);
|
||||
2
migrations/20241105191622_add_book_search.down.sql
Normal file
2
migrations/20241105191622_add_book_search.down.sql
Normal file
@@ -0,0 +1,2 @@
|
||||
-- Add down migration script here
|
||||
ALTER TABLE books DROP COLUMN searchable;
|
||||
2
migrations/20241105191622_add_book_search.up.sql
Normal file
2
migrations/20241105191622_add_book_search.up.sql
Normal file
@@ -0,0 +1,2 @@
|
||||
-- Add up migration script here
|
||||
ALTER TABLE books ADD COLUMN searchable tsvector generated always as (to_tsvector('english', title || ' ' || summary)) STORED;
|
||||
Reference in New Issue
Block a user