Author: Manos Pitsidianakis [manos@pitsidianak.is]
Hash: ae96038fbf5884d5f4e52d0b0488dedb6e5f4050
Timestamp: Thu, 11 Apr 2024 18:15:47 +0000 (5 months ago)

+16 -147 +/-13 browse
Make unicode-segmentation a hard dependency
Make unicode-segmentation a hard dependency

meli/melib are UTF8 software, so we should have proper Unicode support.

A compile-time env var, `UNICODE_REGENERATE_TABLES`, is added to force
network access and rebuild the cached Unicode tables.

Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is>
1diff --git a/BUILD.md b/BUILD.md
2index ed45cf8..dfaabe2 100644
3--- a/BUILD.md
4+++ b/BUILD.md
5 @@ -3,7 +3,7 @@
6 For a quick start, build and install locally:
7
8 ```sh
9- PREFIX=~/.local make install
10+ PREFIX=~/.local make install
11 ```
12
13 Available subcommands for `make` are listed with `make help`.
14 @@ -34,6 +34,9 @@ Some functionality is held behind "feature gates", or compile-time flags. The fo
15 Since it's actual use in the code is very limited, it is not recommended to use this (off by default).
16 - `static` and `*-static` bundle C libraries in dependencies so that you don't need them installed in your system (on by default).
17
18+ Though not a feature, the presence of the environment variable `UNICODE_REGENERATE_TABLES` in compile-time of the `melib` crate will force the regeneration of unicode tables.
19+ Otherwise the tables are included with the source code, and there's no real reason to regenerate them unless you intend to modify the code or update to a new Unicode version.
20+
21 ## Build Debian package (*deb*)
22
23 Building with Debian's packaged cargo might require the installation of these two packages: `librust-openssl-sys-dev librust-libdbus-sys-dev`
24 diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
25index 459b6b7..6eb4324 100644
26--- a/fuzz/Cargo.toml
27+++ b/fuzz/Cargo.toml
28 @@ -14,10 +14,7 @@ path = "fuzz_targets/envelope_parse.rs"
29
30 [dependencies]
31 libfuzzer-sys = "0.3"
32-
33- [dependencies.melib]
34- path = "../melib"
35- features = ["unicode-algorithms"]
36+ melib = { path = "../melib" }
37
38 # Prevent this from interfering with workspaces
39 [workspace]
40 diff --git a/meli/Cargo.toml b/meli/Cargo.toml
41index 84b57f4..ea3d866 100644
42--- a/meli/Cargo.toml
43+++ b/meli/Cargo.toml
44 @@ -31,7 +31,7 @@ indexmap = { version = "^1.6", features = ["serde-1"] }
45 libc = { version = "0.2.125", default-features = false, features = ["extra_traits"] }
46 libz-sys = { version = "1.1", features = ["static"], optional = true }
47 linkify = { version = "^0.8", default-features = false }
48- melib = { path = "../melib", version = "0.8.5-rc.3", features = ["unicode-algorithms"] }
49+ melib = { path = "../melib", version = "0.8.5-rc.3", features = [] }
50 nix = { version = "0.27", default-features = false, features = ["signal", "poll", "term", "ioctl", "process"] }
51 notify = { version = "4.0.1", default-features = false } # >:c
52 num_cpus = "1.12.0"
53 diff --git a/melib/Cargo.toml b/melib/Cargo.toml
54index d5884bd..b3ad509 100644
55--- a/melib/Cargo.toml
56+++ b/melib/Cargo.toml
57 @@ -50,7 +50,7 @@ serde_path_to_error = { version = "0.1" }
58 smallvec = { version = "^1.5.0", features = ["serde"] }
59 smol = "1.0.0"
60 socket2 = { version = "0.5", features = [] }
61- unicode-segmentation = { version = "1.2.1", default-features = false, optional = true }
62+ unicode-segmentation = { version = "1.2.1", default-features = false }
63 url = { version = "2.4", optional = true }
64 uuid = { version = "^1", features = ["serde", "v4", "v5"] }
65 xdg = "2.1.0"
66 @@ -77,9 +77,6 @@ sqlite3 = ["rusqlite"]
67 sqlite3-static = ["sqlite3", "rusqlite/bundled-full"]
68 tls = ["native-tls"]
69 tls-static = ["tls", "native-tls/vendored"]
70- text-processing = []
71- unicode-algorithms = ["text-processing", "unicode-segmentation"]
72- unicode-algorithms-cached = ["text-processing", "unicode-segmentation"]
73 vcard = []
74
75 [build-dependencies]
76 diff --git a/melib/README.md b/melib/README.md
77index 36fe3b7..b3cbe22 100644
78--- a/melib/README.md
79+++ b/melib/README.md
80 @@ -22,24 +22,6 @@ Library for handling mail.
81 |------------------------------|-------------------------------------|--------------------------|
82 | `sqlite` | `rusqlite` | Used in IMAP cache. |
83 |------------------------------|-------------------------------------|--------------------------|
84- | `unicode-algorithms` | `unicode-segmentation` | Linebreaking algo etc |
85- | | | For a fresh clean build, |
86- | | | Network access is |
87- | | | required to fetch data |
88- | | | from Unicode's website. |
89- |------------------------------|-------------------------------------|--------------------------|
90- | `unicode-algorithms-cached` | `unicode-segmentation` | Linebreaking algo etc |
91- | | | but it uses a cached |
92- | | | version of Unicode data |
93- | | | which might be stale. |
94- | | | |
95- | | | Use this feature instead |
96- | | | of the previous one for |
97- | | | building without network |
98- | | | access. |
99- |------------------------------|-------------------------------------|--------------------------|
100- | `unicode-algorithms` | `unicode-segmentation` | |
101- |------------------------------|-------------------------------------|--------------------------|
102 | `vcard` | | vcard parsing |
103 |------------------------------|-------------------------------------|--------------------------|
104 | `gpgme` | | GPG use with libgpgme |
105 diff --git a/melib/build.rs b/melib/build.rs
106index b9a0dbd..348ad20 100644
107--- a/melib/build.rs
108+++ b/melib/build.rs
109 @@ -21,15 +21,14 @@
110
111 #![allow(clippy::needless_range_loop)]
112
113- #[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
114 include!("src/text/types.rs");
115
116 fn main() -> Result<(), std::io::Error> {
117- #[cfg(any(feature = "unicode-algorithms", feature = "unicode-algorithms-cached"))]
118 {
119 const MOD_PATH: &str = "src/text/tables.rs";
120+ println!("cargo:rerun-if-env-changed=UNICODE_REGENERATE_TABLES");
121 println!("cargo:rerun-if-changed=build.rs");
122- println!("cargo:rerun-if-changed={}", MOD_PATH);
123+ println!("cargo:rerun-if-changed={MOD_PATH}");
124 /* Line break tables */
125 use std::{
126 fs::File,
127 @@ -54,7 +53,7 @@ fn main() -> Result<(), std::io::Error> {
128 );
129 return Ok(());
130 }
131- if cfg!(feature = "unicode-algorithms-cached") {
132+ if std::env::var("UNICODE_REGENERATE_TABLES").is_err() {
133 const CACHED_MODULE: &[u8] = include_bytes!(concat!("./src/text/tables.rs.gz"));
134
135 let mut gz = GzDecoder::new(CACHED_MODULE);
136 diff --git a/melib/src/email/compose/mime.rs b/melib/src/email/compose/mime.rs
137index 061e57e..6b11acb 100644
138--- a/melib/src/email/compose/mime.rs
139+++ b/melib/src/email/compose/mime.rs
140 @@ -20,14 +20,12 @@
141 */
142
143 use super::*;
144- #[cfg(feature = "text-processing")]
145 use crate::text::grapheme_clusters::TextProcessing;
146
147 pub fn encode_header(value: &str) -> String {
148 let mut ret = String::with_capacity(value.len());
149 let mut is_current_window_ascii = true;
150 let mut current_window_start = 0;
151- #[cfg(feature = "text-processing")]
152 {
153 let graphemes = value.graphemes_indices();
154 for (idx, g) in graphemes {
155 @@ -81,63 +79,6 @@ pub fn encode_header(value: &str) -> String {
156 }
157 }
158 }
159- #[cfg(not(feature = "text-processing"))]
160- {
161- /* [ref:VERIFY] [ref:TODO]: test this. If it works as fine as the one above, there's no need to
162- * keep the above implementation. */
163- for (i, g) in value.char_indices() {
164- match (g.is_ascii(), is_current_window_ascii) {
165- (true, true) => {
166- ret.push(g);
167- }
168- (true, false) => {
169- /* If !g.is_whitespace()
170- *
171- * Whitespaces inside encoded tokens must be greedily taken,
172- * instead of splitting each non-ascii word into separate encoded tokens. */
173- if !g.is_whitespace() && value.is_char_boundary(i) {
174- ret.push_str(&format!(
175- "=?UTF-8?B?{}?=",
176- BASE64_MIME
177- .encode(value[current_window_start..i].as_bytes())
178- .trim()
179- ));
180- if i != value.len() - 1 {
181- ret.push(' ');
182- }
183- is_current_window_ascii = true;
184- current_window_start = i;
185- ret.push(g);
186- }
187- }
188- (false, true) => {
189- current_window_start = i;
190- is_current_window_ascii = false;
191- }
192- /* RFC2047 recommends:
193- * 'While there is no limit to the length of a multiple-line header field, each
194- * line of a header field that contains one or more
195- * 'encoded-word's is limited to 76 characters.'
196- * This is a rough compliance.
197- */
198- (false, false)
199- if value.is_char_boundary(i) && value[current_window_start..i].len() > 76 =>
200- {
201- ret.push_str(&format!(
202- "=?UTF-8?B?{}?=",
203- BASE64_MIME
204- .encode(value[current_window_start..i].as_bytes())
205- .trim()
206- ));
207- if i != value.len() - 1 {
208- ret.push(' ');
209- }
210- current_window_start = i;
211- }
212- (false, false) => {}
213- }
214- }
215- }
216 /* If the last part of the header value is encoded, it won't be pushed inside
217 * the previous for block */
218 if !is_current_window_ascii {
219 diff --git a/melib/src/lib.rs b/melib/src/lib.rs
220index 2554134..d27731e 100644
221--- a/melib/src/lib.rs
222+++ b/melib/src/lib.rs
223 @@ -132,7 +132,6 @@ pub mod dbg {
224 }
225 }
226
227- #[cfg(feature = "text-processing")]
228 pub mod text;
229
230 pub use utils::{
231 diff --git a/melib/src/text/grapheme_clusters.rs b/melib/src/text/grapheme_clusters.rs
232index 670b25e..324d7fc 100644
233--- a/melib/src/text/grapheme_clusters.rs
234+++ b/melib/src/text/grapheme_clusters.rs
235 @@ -29,12 +29,12 @@
236
237 */
238
239+ use unicode_segmentation::UnicodeSegmentation;
240+
241 use super::{
242 types::Reflow,
243 wcwidth::{wcwidth, CodePointsIter},
244 };
245- extern crate unicode_segmentation;
246- use self::unicode_segmentation::UnicodeSegmentation;
247
248 pub trait TextProcessing: UnicodeSegmentation + CodePointsIter {
249 fn split_graphemes(&self) -> Vec<&str> {
250 diff --git a/melib/src/text/line_break.rs b/melib/src/text/line_break.rs
251index e4e7e26..5a20864 100644
252--- a/melib/src/text/line_break.rs
253+++ b/melib/src/text/line_break.rs
254 @@ -19,12 +19,11 @@
255 * along with meli. If not, see <http://www.gnu.org/licenses/>.
256 */
257
258- extern crate unicode_segmentation;
259 use std::{cmp::Ordering, collections::VecDeque, iter::Peekable, str::FromStr};
260
261+ use unicode_segmentation::UnicodeSegmentation;
262 use LineBreakClass::*;
263
264- use self::unicode_segmentation::UnicodeSegmentation;
265 use super::{
266 grapheme_clusters::TextProcessing,
267 tables::LINE_BREAK_RULES,
268 diff --git a/melib/src/text/mod.rs b/melib/src/text/mod.rs
269index 4348aff..1cc1a18 100644
270--- a/melib/src/text/mod.rs
271+++ b/melib/src/text/mod.rs
272 @@ -19,6 +19,8 @@
273 * along with meli. If not, see <http://www.gnu.org/licenses/>.
274 */
275
276+ use unicode_segmentation::UnicodeSegmentation;
277+
278 pub mod grapheme_clusters;
279 pub mod line_break;
280 pub mod search;
281 @@ -43,8 +45,6 @@ impl Truncate for &str {
282 return;
283 }
284
285- extern crate unicode_segmentation;
286- use unicode_segmentation::UnicodeSegmentation;
287 if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
288 .take(new_len)
289 .last()
290 @@ -58,8 +58,6 @@ impl Truncate for &str {
291 return self;
292 }
293
294- extern crate unicode_segmentation;
295- use unicode_segmentation::UnicodeSegmentation;
296 if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(*self, true)
297 .take(new_len)
298 .last()
299 @@ -75,8 +73,6 @@ impl Truncate for &str {
300 return "";
301 }
302
303- extern crate unicode_segmentation;
304- use unicode_segmentation::UnicodeSegmentation;
305 if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
306 &self[first..]
307 } else {
308 @@ -90,8 +86,6 @@ impl Truncate for &str {
309 return;
310 }
311
312- extern crate unicode_segmentation;
313- use unicode_segmentation::UnicodeSegmentation;
314 if let Some((first, _)) = UnicodeSegmentation::grapheme_indices(*self, true).nth(skip_len) {
315 *self = &self[first..];
316 }
317 @@ -104,8 +98,6 @@ impl Truncate for String {
318 return;
319 }
320
321- extern crate unicode_segmentation;
322- use unicode_segmentation::UnicodeSegmentation;
323 if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
324 .take(new_len)
325 .last()
326 @@ -119,8 +111,6 @@ impl Truncate for String {
327 return self;
328 }
329
330- extern crate unicode_segmentation;
331- use unicode_segmentation::UnicodeSegmentation;
332 if let Some((last, _)) = UnicodeSegmentation::grapheme_indices(self.as_str(), true)
333 .take(new_len)
334 .last()
335 @@ -136,8 +126,6 @@ impl Truncate for String {
336 return "";
337 }
338
339- extern crate unicode_segmentation;
340- use unicode_segmentation::UnicodeSegmentation;
341 if let Some((first, _)) =
342 UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
343 {
344 @@ -153,8 +141,6 @@ impl Truncate for String {
345 return;
346 }
347
348- extern crate unicode_segmentation;
349- use unicode_segmentation::UnicodeSegmentation;
350 if let Some((first, _)) =
351 UnicodeSegmentation::grapheme_indices(self.as_str(), true).nth(skip_len)
352 {
353 diff --git a/melib/src/thread.rs b/melib/src/thread.rs
354index e917e26..ebd0b2f 100644
355--- a/melib/src/thread.rs
356+++ b/melib/src/thread.rs
357 @@ -52,7 +52,6 @@ pub use iterators::*;
358 use smallvec::SmallVec;
359 use uuid::Uuid;
360
361- #[cfg(feature = "text-processing")]
362 use crate::text::grapheme_clusters::*;
363
364 type Envelopes = Arc<RwLock<HashMap<EnvelopeHash, Envelope>>>;
365 @@ -1223,16 +1222,11 @@ impl Threads {
366 }
367 let ma = &envelopes[&a.unwrap()];
368 let mb = &envelopes[&b.unwrap()];
369- #[cfg(feature = "text-processing")]
370 {
371 ma.subject()
372 .split_graphemes()
373 .cmp(&mb.subject().split_graphemes())
374 }
375- #[cfg(not(feature = "text-processing"))]
376- {
377- ma.subject().cmp(&mb.subject())
378- }
379 }
380 (SortField::Subject, SortOrder::Asc) => {
381 let a = &self.thread_nodes[&self.thread_ref(*a).root()].message();
382 @@ -1252,18 +1246,12 @@ impl Threads {
383 }
384 let ma = &envelopes[&a.unwrap()];
385 let mb = &envelopes[&b.unwrap()];
386- #[cfg(feature = "text-processing")]
387 {
388 mb.subject()
389 .as_ref()
390 .split_graphemes()
391 .cmp(&ma.subject().split_graphemes())
392 }
393-
394- #[cfg(not(feature = "text-processing"))]
395- {
396- mb.subject().as_ref().cmp(&ma.subject())
397- }
398 }
399 });
400 }
401 @@ -1303,16 +1291,11 @@ impl Threads {
402 }
403 let ma = &envelopes[&a.unwrap()];
404 let mb = &envelopes[&b.unwrap()];
405- #[cfg(feature = "text-processing")]
406 {
407 ma.subject()
408 .split_graphemes()
409 .cmp(&mb.subject().split_graphemes())
410 }
411- #[cfg(not(feature = "text-processing"))]
412- {
413- ma.subject().cmp(&mb.subject())
414- }
415 }
416 (SortField::Subject, SortOrder::Asc) => {
417 let a = &self.thread_nodes[a].message();
418 @@ -1332,18 +1315,12 @@ impl Threads {
419 }
420 let ma = &envelopes[&a.unwrap()];
421 let mb = &envelopes[&b.unwrap()];
422- #[cfg(feature = "text-processing")]
423 {
424 mb.subject()
425 .as_ref()
426 .split_graphemes()
427 .cmp(&ma.subject().split_graphemes())
428 }
429-
430- #[cfg(not(feature = "text-processing"))]
431- {
432- mb.subject().as_ref().cmp(&ma.subject())
433- }
434 }
435 });
436 }
437 @@ -1379,16 +1356,11 @@ impl Threads {
438 }
439 let ma = &envelopes[&a.unwrap()];
440 let mb = &envelopes[&b.unwrap()];
441- #[cfg(feature = "text-processing")]
442 {
443 ma.subject()
444 .split_graphemes()
445 .cmp(&mb.subject().split_graphemes())
446 }
447- #[cfg(not(feature = "text-processing"))]
448- {
449- ma.subject().cmp(&mb.subject())
450- }
451 }
452 (SortField::Subject, SortOrder::Asc) => {
453 let a = &self.thread_nodes[a].message();
454 @@ -1408,18 +1380,12 @@ impl Threads {
455 }
456 let ma = &envelopes[&a.unwrap()];
457 let mb = &envelopes[&b.unwrap()];
458- #[cfg(feature = "text-processing")]
459 {
460 mb.subject()
461 .as_ref()
462 .split_graphemes()
463 .cmp(&ma.subject().split_graphemes())
464 }
465-
466- #[cfg(not(feature = "text-processing"))]
467- {
468- mb.subject().as_ref().cmp(&ma.subject())
469- }
470 }
471 });
472 }
473 diff --git a/tools/Cargo.toml b/tools/Cargo.toml
474index e5621c9..5cdf3db 100644
475--- a/tools/Cargo.toml
476+++ b/tools/Cargo.toml
477 @@ -40,7 +40,7 @@ required-features = ["melib/imap"]
478 [dependencies]
479 crossbeam = { version = "^0.8" }
480 meli = { path = "../meli", version = "0.8" }
481- melib = { path = "../melib", version = "0.8", features = ["debug-tracing", "unicode-algorithms"] }
482+ melib = { path = "../melib", version = "0.8", features = ["debug-tracing" ] }
483 nix = { version = "^0.24", default-features = false }
484 signal-hook = { version = "^0.3", default-features = false, features = ["iterator"] }
485 signal-hook-registry = { version = "1.2.0", default-features = false }