Skip to content

Commit c5cc555

Browse files
Add:isValidTitle function and unit testing
1 parent 3ad99a1 commit c5cc555

File tree

3 files changed

+216
-112
lines changed

3 files changed

+216
-112
lines changed

.vscode/settings.json

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
{
2+
"files.associations": {
3+
"string": "cpp",
4+
"__bit_reference": "cpp",
5+
"__bits": "cpp",
6+
"__config": "cpp",
7+
"__debug": "cpp",
8+
"__errc": "cpp",
9+
"__hash_table": "cpp",
10+
"__locale": "cpp",
11+
"__mutex_base": "cpp",
12+
"__node_handle": "cpp",
13+
"__split_buffer": "cpp",
14+
"__threading_support": "cpp",
15+
"__tree": "cpp",
16+
"__tuple": "cpp",
17+
"__verbose_abort": "cpp",
18+
"array": "cpp",
19+
"atomic": "cpp",
20+
"bit": "cpp",
21+
"bitset": "cpp",
22+
"cctype": "cpp",
23+
"clocale": "cpp",
24+
"cmath": "cpp",
25+
"cstdarg": "cpp",
26+
"cstddef": "cpp",
27+
"cstdint": "cpp",
28+
"cstdio": "cpp",
29+
"cstdlib": "cpp",
30+
"cstring": "cpp",
31+
"ctime": "cpp",
32+
"cwchar": "cpp",
33+
"cwctype": "cpp",
34+
"exception": "cpp",
35+
"initializer_list": "cpp",
36+
"ios": "cpp",
37+
"iosfwd": "cpp",
38+
"iostream": "cpp",
39+
"istream": "cpp",
40+
"limits": "cpp",
41+
"locale": "cpp",
42+
"map": "cpp",
43+
"memory": "cpp",
44+
"mutex": "cpp",
45+
"new": "cpp",
46+
"optional": "cpp",
47+
"ostream": "cpp",
48+
"ratio": "cpp",
49+
"sstream": "cpp",
50+
"stdexcept": "cpp",
51+
"streambuf": "cpp",
52+
"string_view": "cpp",
53+
"system_error": "cpp",
54+
"tuple": "cpp",
55+
"type_traits": "cpp",
56+
"typeinfo": "cpp",
57+
"unordered_map": "cpp",
58+
"variant": "cpp",
59+
"vector": "cpp",
60+
"chrono": "cpp",
61+
"compare": "cpp",
62+
"algorithm": "cpp",
63+
"regex": "cpp"
64+
}
65+
}

src/zimwriterfs/zimcreatorfs.cpp

Lines changed: 85 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,34 @@
1919
*/
2020

2121
#include "zimcreatorfs.h"
22-
#include "../tools.h"
23-
#include "tools.h"
2422

25-
#include <fstream>
2623
#include <dirent.h>
24+
#include <limits.h>
2725
#include <sys/stat.h>
28-
#include <regex>
2926
#include <unistd.h>
30-
#include <limits.h>
27+
3128
#include <cassert>
29+
#include <fstream>
30+
#include <regex>
31+
32+
#include "../tools.h"
33+
#include "tools.h"
3234

33-
void parse_redirectArticles(std::istream& in_stream, redirect_handler handler) {
35+
void parse_redirectArticles(std::istream& in_stream, redirect_handler handler)
36+
{
3437
std::string line;
3538
int line_number = 1;
3639
while (std::getline(in_stream, line)) {
3740
std::regex line_regex("^([^\\t]+)\\t([^\\t]+)\\t([^\\t]+)$");
3841
std::smatch matches;
3942
if (!std::regex_search(line, matches, line_regex) || matches.size() != 4) {
40-
throw std::runtime_error(
41-
Formatter() << "Invalid line #" << line_number << " : '" << line << "'"
42-
);
43+
throw std::runtime_error(Formatter() << "Invalid line #" << line_number
44+
<< " : '" << line << "'");
4345
}
4446

45-
Redirect redirect = {
46-
.path= matches[1].str(),
47-
.title = matches[2].str(),
48-
.target = matches[3].str()
49-
};
47+
Redirect redirect = {.path = matches[1].str(),
48+
.title = matches[2].str(),
49+
.target = matches[3].str()};
5050
handler(redirect);
5151
++line_number;
5252
}
@@ -55,14 +55,14 @@ void parse_redirectArticles(std::istream& in_stream, redirect_handler handler) {
5555
bool isVerbose();
5656

5757
ZimCreatorFS::ZimCreatorFS(std::string _directoryPath)
58-
: directoryPath(_directoryPath)
58+
: directoryPath(_directoryPath)
5959
{
6060
char buf[PATH_MAX];
6161

6262
if (realpath(directoryPath.c_str(), buf) != buf) {
63-
throw std::invalid_argument(
64-
Formatter() << "Unable to canonicalize HTML directory path "
65-
<< directoryPath << ": " << strerror(errno));
63+
throw std::invalid_argument(Formatter()
64+
<< "Unable to canonicalize HTML directory path "
65+
<< directoryPath << ": " << strerror(errno));
6666
}
6767

6868
canonical_basedir = buf;
@@ -74,17 +74,13 @@ void ZimCreatorFS::add_redirectArticles_from_file(const std::string& path)
7474

7575
in_stream.open(path.c_str());
7676
try {
77-
parse_redirectArticles(in_stream,
78-
[this](Redirect redirect) {
79-
this->addRedirection(
80-
redirect.path,
81-
redirect.title,
82-
redirect.target,
83-
{{zim::writer::HintKeys::FRONT_ARTICLE, 1}}
84-
);
85-
}
86-
);
87-
} catch(const std::runtime_error& e) {
77+
parse_redirectArticles(in_stream, [this](Redirect redirect) {
78+
this->addRedirection(redirect.path,
79+
redirect.title,
80+
redirect.target,
81+
{{zim::writer::HintKeys::FRONT_ARTICLE, 1}});
82+
});
83+
} catch (const std::runtime_error& e) {
8884
std::cerr << e.what() << "\nin redirect file " << path << std::endl;
8985
in_stream.close();
9086
exit(1);
@@ -171,14 +167,14 @@ void ZimCreatorFS::visitDirectory(const std::string& path)
171167

172168
void ZimCreatorFS::addFile(const std::string& path)
173169
{
174-
auto url = path.substr(directoryPath.size()+1);
170+
auto url = path.substr(directoryPath.size() + 1);
175171
auto mimetype = getMimeTypeForFile(directoryPath, url);
176172
auto title = std::string{};
177173
zim::writer::Hints hints;
178174

179175
std::shared_ptr<zim::writer::Item> item;
180-
if ( mimetype.find("text/html") != std::string::npos
181-
|| mimetype.find("text/css") != std::string::npos) {
176+
if (mimetype.find("text/html") != std::string::npos
177+
|| mimetype.find("text/css") != std::string::npos) {
182178
auto content = getFileContent(path);
183179

184180
if (mimetype.find("text/html") != std::string::npos) {
@@ -193,14 +189,17 @@ void ZimCreatorFS::addFile(const std::string& path)
193189
adaptCss(content, url);
194190
}
195191

196-
item = zim::writer::StringItem::create(url, mimetype, title, hints, content);
192+
item
193+
= zim::writer::StringItem::create(url, mimetype, title, hints, content);
197194
} else {
198-
item = std::make_shared<zim::writer::FileItem>(url, mimetype, title, hints, path);
195+
item = std::make_shared<zim::writer::FileItem>(
196+
url, mimetype, title, hints, path);
199197
}
200198
addItem(item);
201199
}
202200

203-
void ZimCreatorFS::processSymlink(const std::string& curdir, const std::string& symlink_path)
201+
void ZimCreatorFS::processSymlink(const std::string& curdir,
202+
const std::string& symlink_path)
204203
{
205204
/* #102 Links can be 3 different types:
206205
* - dangling (not pointing to a valid file)
@@ -211,26 +210,28 @@ void ZimCreatorFS::processSymlink(const std::string& curdir, const std::string&
211210
if (realpath(symlink_path.c_str(), resolved) != resolved) {
212211
// looping symlinks also fall here: Too many levels of symbolic links
213212
// It also handles dangling symlink: No such file or directory
214-
std::cerr << "Unable to resolve symlink " << symlink_path
215-
<< ": " << strerror(errno) << std::endl;
213+
std::cerr << "Unable to resolve symlink " << symlink_path << ": "
214+
<< strerror(errno) << std::endl;
216215
return;
217216
}
218217

219218
if (isDirectory(resolved)) {
220-
std::cerr << "Skip symlink " << symlink_path
221-
<< ": points to a directory" << std::endl;
219+
std::cerr << "Skip symlink " << symlink_path << ": points to a directory"
220+
<< std::endl;
222221
return;
223222
}
224223

225-
if (strncmp(canonical_basedir.c_str(), resolved, canonical_basedir.size()) != 0
224+
if (strncmp(canonical_basedir.c_str(), resolved, canonical_basedir.size())
225+
!= 0
226226
|| resolved[canonical_basedir.size()] != '/') {
227227
std::cerr << "Skip symlink " << symlink_path
228228
<< ": points outside of HTML directory" << std::endl;
229229
return;
230230
}
231231

232232
std::string source_url = symlink_path.substr(directoryPath.size() + 1);
233-
std::string target_url = std::string(resolved).substr(canonical_basedir.size() + 1);
233+
std::string target_url
234+
= std::string(resolved).substr(canonical_basedir.size() + 1);
234235
addRedirection(source_url, "", target_url);
235236
}
236237

@@ -256,11 +257,34 @@ inline std::string removeLocalTagAndParameters(const std::string& url)
256257

257258
struct GumboOutputDestructor {
258259
GumboOutputDestructor(GumboOutput* output) : output(output) {}
259-
~GumboOutputDestructor() { gumbo_destroy_output(&kGumboDefaultOptions, output); }
260+
~GumboOutputDestructor()
261+
{
262+
gumbo_destroy_output(&kGumboDefaultOptions, output);
263+
}
260264
GumboOutput* output;
261265
};
262266

263-
std::string ZimCreatorFS::parseAndAdaptHtml(std::string& data, std::string& title, const std::string& url)
267+
/**
 * Tell whether a title is usable as an entry title.
 *
 * A title is accepted when all of the following hold:
 *  1. it is at least `min_length` bytes long (std::string::length(), so a
 *     multi-byte encoded character counts once per byte);
 *  2. it is not made solely of ASCII digits;
 *  3. it contains at least one meaningful character: an ASCII letter, an
 *     ASCII digit, or a non-ASCII byte (high bit set).
 *
 * Non-ASCII bytes are accepted in check 3 so that titles written entirely
 * in a non-Latin script are not rejected as "only special characters".
 * NOTE(review): this assumes titles are UTF-8 encoded, where every byte of
 * a non-ASCII character has its high bit set - confirm against callers.
 *
 * @param title       candidate title.
 * @param min_length  minimal accepted length, in bytes.
 * @return true when the title passes the three checks, false otherwise
 *         (an empty title is always rejected).
 */
inline bool isValidTitle(const std::string& title, size_t min_length)
{
  /* 1. Ensure the title is long enough */
  if (title.length() < min_length) {
    return false;
  }

  bool onlyDigits = !title.empty();
  bool hasMeaningfulChar = false;

  /* Single pass over the bytes; explicit ASCII ranges keep the result
   * independent of the current C locale. */
  for (const unsigned char c : title) {
    const bool isDigit = (c >= '0' && c <= '9');
    const bool isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    const bool isNonAscii = (c >= 0x80);

    if (!isDigit) {
      onlyDigits = false;
    }
    if (isDigit || isLetter || isNonAscii) {
      hasMeaningfulChar = true;
    }
  }

  /* 2. Not only numeric, and 3. not only punctuation/whitespace */
  return !onlyDigits && hasMeaningfulChar;
}
284+
285+
std::string ZimCreatorFS::parseAndAdaptHtml(std::string& data,
286+
std::string& title,
287+
const std::string& url)
264288
{
265289
GumboOutput* output = gumbo_parse(data.c_str());
266290
GumboOutputDestructor outputDestructor(output);
@@ -310,7 +334,8 @@ std::string ZimCreatorFS::parseAndAdaptHtml(std::string& data, std::string& titl
310334
auto redirectUrl = computeAbsolutePath(url, decodeUrl(targetUrl));
311335
auto redirectUrlPath = directoryPath + "/" + redirectUrl;
312336
if (!fileExists(redirectUrlPath)) {
313-
throw std::runtime_error("'" + url + "' HTML redirection target path '"
337+
throw std::runtime_error("'" + url
338+
+ "' HTML redirection target path '"
314339
+ redirectUrlPath + "' doesn't exist.");
315340
}
316341
return redirectUrl;
@@ -331,29 +356,32 @@ std::string ZimCreatorFS::parseAndAdaptHtml(std::string& data, std::string& titl
331356
std::replace(title.begin(), title.end(), '_', ' ');
332357
}
333358
}
359+
// Extra validation of the title
360+
if (!isValidTitle(title, 3)) {
361+
std::cerr << "Warning: Generated title is invalid for URL: " << url
362+
<< std::endl;
363+
title = "Unknown title"; // or some other default title
364+
}
334365
}
335366
return "";
336367
}
337368

338-
void ZimCreatorFS::adaptCss(std::string& data, const std::string& url) {
369+
void ZimCreatorFS::adaptCss(std::string& data, const std::string& url)
370+
{
339371
/* Rewrite url() values in the CSS */
340372
size_t startPos = 0;
341373
size_t endPos = 0;
342374
std::string targetUrl;
343375

344376
while ((startPos = data.find("url(", endPos))
345377
&& startPos != std::string::npos) {
346-
347378
/* URL delimiters */
348379
endPos = data.find(")", startPos);
349-
startPos = startPos + (data[startPos + 4] == '\''
350-
|| data[startPos + 4] == '"'
351-
? 5
352-
: 4);
353-
endPos = endPos - (data[endPos - 1] == '\''
354-
|| data[endPos - 1] == '"'
355-
? 1
356-
: 0);
380+
startPos
381+
= startPos
382+
+ (data[startPos + 4] == '\'' || data[startPos + 4] == '"' ? 5 : 4);
383+
endPos = endPos
384+
- (data[endPos - 1] == '\'' || data[endPos - 1] == '"' ? 1 : 0);
357385
targetUrl = data.substr(startPos, endPos - startPos);
358386
std::string startDelimiter = data.substr(startPos - 1, 1);
359387
std::string endDelimiter = data.substr(endPos, 1);
@@ -385,9 +413,9 @@ void ZimCreatorFS::adaptCss(std::string& data, const std::string& url) {
385413
data,
386414
startDelimiter + targetUrl + endDelimiter,
387415
startDelimiter + "data:" + mimeType + ";base64,"
388-
+ base64_encode(reinterpret_cast<const unsigned char*>(
389-
fontContent.c_str()),
390-
fontContent.length())
416+
+ base64_encode(
417+
reinterpret_cast<const unsigned char*>(fontContent.c_str()),
418+
fontContent.length())
391419
+ endDelimiter);
392420
} catch (...) {
393421
}

0 commit comments

Comments
 (0)