add unicode-linebreak
parent
40b87a4bb9
commit
342422e221
@ -0,0 +1,48 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
unicode-linebreak is © 2019 Matthew Butterick
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
unicode-linebreak contains substantial portions of the following software:
|
||||||
|
|
||||||
|
[linebreak](https://github.com/foliojs/linebreak)
|
||||||
|
|
||||||
|
MIT LICENSE
|
||||||
|
Copyright (c) 2014-16 Devon Govett
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
@ -0,0 +1,4 @@
|
|||||||
|
# unicode-linebreak
|
||||||
|
Racket implementation of the Unicode line-breaking algorithm
|
||||||
|
|
||||||
|
Based on https://github.com/foliojs/linebreak
|
@ -0,0 +1,4 @@
|
|||||||
|
#lang info
|
||||||
|
;; Package metadata read by the Racket package tooling.
;; 'multi marks this as a multi-collection package (collections are the
;; subdirectories rather than the package root).
(define collection 'multi)
|
||||||
|
;; Package version string.
(define version "0.0")
|
||||||
|
;; NOTE(review): per the info.rkt docs, 'all should exclude every path in the
;; package from `raco test` — confirm this is still wanted once tests exist.
(define test-omit-paths 'all)
|
@ -0,0 +1,34 @@
|
|||||||
|
#lang debug racket/base
|
||||||
|
(require (for-syntax racket/base) racket/match racket/string)
|
||||||
|
|
||||||
|
(module+ reader
  (require syntax/strip-context)
  (provide (rename-out [rs read-syntax]))

  ;; rs : any/c input-port? -> syntax?
  ;; `read-syntax` for property files whose lines look like
  ;;     RANGE;CLASS   # trailing comment
  ;; where RANGE is either one hex code point or `LO..HI`.
  ;; Each non-empty line becomes `((n ...) CLASS)`: a list of one or two
  ;; integers plus the class as a symbol. The records become the body of a
  ;; module written in "classes-prep.rkt". `name` is accepted but unused.
  (define (rs name ip)
    (define lines
      (for*/list ([line (in-lines ip)]
                  ;; strip the `# ...` comment, then the surrounding whitespace
                  [str (in-value (string-trim (string-trim line #px"#.*" #:left? #false)))]
                  #:when (non-empty-string? str))
        ;; Fields may be padded with spaces around the `;` (spaces around a
        ;; field are not significant in UCD files), so trim each one; otherwise
        ;; `string->number` returns #f and the class symbol keeps a leading space.
        (match-define (list range tag) (map string-trim (string-split str ";")))
        (list (map (λ (hex) (string->number hex 16)) (string-split range ".."))
              (string->symbol tag))))
    ;; drop lexical context so the generated module is expanded afresh
    (strip-context
     (with-syntax ([LINES lines])
       #'(module _ "classes-prep.rkt"
           . LINES)))))
|
||||||
|
|
||||||
|
;; (make-cond subject v)      expands to a test that `subject` is the value v
;; (make-cond subject lo hi)  expands to a test that lo <= subject <= hi
(define-syntax make-cond
  (syntax-rules ()
    ;; `eq?` is used on the assumption that a code point is always a fixnum
    [(_ subject only) (eq? subject only)]
    [(_ subject lo hi) (<= lo subject hi)]))
|
||||||
|
|
||||||
|
;; This module is a module language: it re-exports racket/base, swapping in
;; `mb` as the `#%module-begin`.
(provide (except-out (all-from-out racket/base) #%module-begin)
         (rename-out [mb #%module-begin]))

;; Module body is a sequence of `(RANGE CLASS)` records, where RANGE is a list
;; of one or two values handed to `make-cond`. Expands into a module that
;; defines and provides a one-argument function named `f` (given the lexical
;; context of the using module). `f` tries the records in order and returns the
;; quoted CLASS of the first record whose RANGE matches its argument. The
;; `cond` has no `else`, so an unmatched argument yields the void value.
(define-syntax (mb stx)
  (syntax-case stx ()
    [(_ (RANGE CLASS) ...)
     (let ([lookup-id (datum->syntax stx 'f)])
       #`(#%module-begin
          (provide #,lookup-id)
          (define (#,lookup-id codepoint)
            (cond [(make-cond codepoint . RANGE) 'CLASS] ...))))]))
|
@ -0,0 +1,46 @@
|
|||||||
|
#lang racket/base
|
||||||
|
(provide (all-defined-out))
|
||||||
|
|
||||||
|
;; Integer codes for the line-breaking classes, one `define` per class.
;; The codes are consecutive: 0-28 for the first group, 29-39 for the second.
;; NOTE(review): the pair table added in this commit also has EB, EM and ZWJ
;; rows/columns, which have no constant here — confirm whether they belong in
;; this list (and with which codes).
;; The following break classes are handled by the pair table
|
||||||
|
(define OP 0) ;; Opening punctuation
|
||||||
|
(define CL 1) ;; Closing punctuation
|
||||||
|
(define CP 2) ;; Closing parenthesis
|
||||||
|
(define QU 3) ;; Ambiguous quotation
|
||||||
|
(define GL 4) ;; Glue
|
||||||
|
(define NS 5) ;; Non-starters
|
||||||
|
(define EX 6) ;; Exclamation/Interrogation
|
||||||
|
(define SY 7) ;; Symbols allowing break after
|
||||||
|
(define IS 8) ;; Infix separator
|
||||||
|
(define PR 9) ;; Prefix
|
||||||
|
(define PO 10) ;; Postfix
|
||||||
|
(define NU 11) ;; Numeric
|
||||||
|
(define AL 12) ;; Alphabetic
|
||||||
|
(define HL 13) ;; Hebrew Letter
|
||||||
|
(define ID 14) ;; Ideographic
|
||||||
|
(define IN 15) ;; Inseparable characters
|
||||||
|
(define HY 16) ;; Hyphen
|
||||||
|
(define BA 17) ;; Break after
|
||||||
|
(define BB 18) ;; Break before
|
||||||
|
(define B2 19) ;; Break on either side (but not pair)
|
||||||
|
(define ZW 20) ;; Zero-width space
|
||||||
|
(define CM 21) ;; Combining marks
|
||||||
|
(define WJ 22) ;; Word joiner
|
||||||
|
(define H2 23) ;; Hangul LV
|
||||||
|
(define H3 24) ;; Hangul LVT
|
||||||
|
(define JL 25) ;; Hangul L Jamo
|
||||||
|
(define JV 26) ;; Hangul V Jamo
|
||||||
|
(define JT 27) ;; Hangul T Jamo
|
||||||
|
(define RI 28) ;; Regional Indicator
|
||||||
|
|
||||||
|
;; The following break classes are not handled by the pair table
|
||||||
|
(define AI 29) ;; Ambiguous (Alphabetic or Ideograph)
|
||||||
|
(define BK 30) ;; Break (mandatory)
|
||||||
|
(define CB 31) ;; Contingent break
|
||||||
|
(define CJ 32) ;; Conditional Japanese Starter
|
||||||
|
(define CR 33) ;; Carriage return
|
||||||
|
(define LF 34) ;; Line feed
|
||||||
|
(define NL 35) ;; Next line
|
||||||
|
(define SA 36) ;; South-East Asian
|
||||||
|
(define SG 37) ;; Surrogates
|
||||||
|
(define SP 38) ;; Space
|
||||||
|
(define XX 39) ;; Unknown
|
@ -0,0 +1,76 @@
|
|||||||
|
#lang at-exp debug racket/base
|
||||||
|
(require racket/string racket/match)
|
||||||
|
(provide (all-defined-out))
|
||||||
|
|
||||||
|
;; Break-action codes: the values a pair-table cell decodes to (see `tok->val`).
(define DI_BRK 0) ; Direct break opportunity
|
||||||
|
(define IN_BRK 1) ; Indirect break opportunity
|
||||||
|
(define CI_BRK 2) ; Indirect break opportunity for combining marks
|
||||||
|
(define CP_BRK 3) ; Prohibited break for combining marks
|
||||||
|
(define PR_BRK 4) ; Prohibited break
|
||||||
|
|
||||||
|
;; tok->val : string? -> exact-nonnegative-integer?
;; Decodes one pair-table cell symbol into its break-action code.
;; Any other string is a match failure.
(define (tok->val tok)
  (match tok
    ["_" DI_BRK]
    ["%" IN_BRK]
    ["#" CI_BRK]
    ["@" CP_BRK]
    ["^" PR_BRK]))
|
||||||
|
|
||||||
|
;; make-pair-table : string? ... -> (listof (listof exact-nonnegative-integer?))
;; Turns the table text into a list of rows of break-action codes.
;; The first string (the column header) is ignored and whitespace-only strings
;; are skipped. In each remaining string the first whitespace-separated field
;; is the row label and every later field is a cell decoded with `tok->val`.
(define/match (make-pair-table . strs)
  [((cons _ recs))
   (for/list ([rec (in-list recs)]
              #:unless (regexp-match? #px"^\\s*$" rec))
     ;; Drop the label by position rather than with
     ;; (string-trim rec #px"\\w+"): `string-trim` trims both ends and `_` is a
     ;; word character, so a row whose last cell is "_" would lose that cell.
     (map tok->val (cdr (string-split rec))))])
|
||||||
|
|
||||||
|
;; make-pair-hash : string? ... -> (hash/c (cons/c symbol? symbol?) exact-nonnegative-integer?)
;; Builds an immutable hash from (row-class . column-class) to break-action code.
;; `strs` is the table text: the header string naming the columns, one
;; separator string (skipped unconditionally), then the row strings.
;; Whitespace-only strings among the rows are skipped. In each row the first
;; whitespace-separated field is the row label; the remaining fields are its
;; cells, matched position by position with the header's column names.
(define/match (make-pair-hash . strs)
  [((list* header _ recs))
   (define rights (map string->symbol (string-split header)))
   (for/fold ([table (hash)])
             ([rec (in-list recs)]
              #:unless (regexp-match? #px"^\\s*$" rec))
     ;; Split once and take the label by position. Trimming the label with a
     ;; \w+ pattern would also strip a trailing "_" cell.
     (match-define (cons label cells) (string-split rec))
     (define left (string->symbol label))
     ;; Columns and cells must advance together. Nesting the two loops (as
     ;; `for*` does) would give every column of a row that row's last cell.
     (for/fold ([table table])
               ([right (in-list rights)]
                [cell (in-list cells)])
       (hash-set table (cons left right) (tok->val cell))))])
|
||||||
|
|
||||||
|
;; Text of the pair table, handed as a list of strings to `make-pair-table`
;; and `make-pair-hash`. The first row names the column classes; every other
;; row starts with its own class label followed by one cell per column.
;; Cell symbols are decoded by `tok->val`:
;;   ^ PR_BRK    @ CP_BRK    _ DI_BRK    % IN_BRK    # CI_BRK
;; table copied from https://www.unicode.org/reports/tr14/tr14-37.html#Table2
|
||||||
|
(define table-strs
|
||||||
|
@list|{
|
||||||
|
OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI EB EM ZWJ
|
||||||
|
OP ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ @ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
|
||||||
|
CL _ ^ ^ % % ^ ^ ^ ^ % % _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
CP _ ^ ^ % % ^ ^ ^ ^ % % % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
QU ^ ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
|
||||||
|
GL % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
|
||||||
|
NS _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
EX _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
SY _ ^ ^ % % % ^ ^ ^ _ _ % _ % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
IS _ ^ ^ % % % ^ ^ ^ _ _ % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
PR % ^ ^ % % % ^ ^ ^ _ _ % % % % _ % % _ _ ^ # ^ % % % % % _ % % %
|
||||||
|
PO % ^ ^ % % % ^ ^ ^ _ _ % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
NU % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
AL % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
HL % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
ID _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
IN _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
HY _ ^ ^ % _ % ^ ^ ^ _ _ % _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
BA _ ^ ^ % _ % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
BB % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
|
||||||
|
B2 _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ ^ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
ZW _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ^ _ _ _ _ _ _ _ _ _ _ _
|
||||||
|
CM % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
WJ % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
|
||||||
|
H2 _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ % % _ _ _ %
|
||||||
|
H3 _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ % _ _ _ %
|
||||||
|
JL _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ % % % % _ _ _ _ %
|
||||||
|
JV _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ % % _ _ _ %
|
||||||
|
JT _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ % _ _ _ %
|
||||||
|
RI _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ % _ _ %
|
||||||
|
EB _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ % %
|
||||||
|
EM _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||||
|
ZWJ _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ % _ % % _ _ ^ # ^ _ _ _ _ _ _ % % %
|
||||||
|
}|)
|
||||||
|
;; The table as a list of rows, each row a list of break-action codes.
(define pair-table (apply make-pair-table table-strs))
|
||||||
|
;; The same table as a hash keyed by (row-class . column-class) symbol pairs.
(define pair-hash (apply make-pair-hash table-strs))
|
||||||
|
|
||||||
|
;; `main` submodule: evaluates `pair-table` as a module-level expression, for
;; quick inspection of the parsed table.
(module+ main
|
||||||
|
pair-table)
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue