add unicode-linebreak
parent
40b87a4bb9
commit
342422e221
@ -0,0 +1,48 @@
|
||||
MIT License
|
||||
|
||||
unicode-linebreak is © 2019 Matthew Butterick
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
---
|
||||
|
||||
unicode-linebreak contains substantial portions of the following software:
|
||||
|
||||
[linebreak](https://github.com/foliojs/linebreak)
|
||||
|
||||
MIT LICENSE
|
||||
Copyright (c) 2014-16 Devon Govett
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -0,0 +1,4 @@
|
||||
# unicode-linebreak
|
||||
Racket implementation of the Unicode line-breaking algorithm
|
||||
|
||||
Based on https://github.com/foliojs/linebreak
|
@ -0,0 +1,4 @@
|
||||
#lang info
|
||||
;; 'multi: the package directory holds several collections (one per
;; subdirectory) rather than being a single collection itself.
(define collection 'multi)
|
||||
;; Package version string.
(define version "0.0")
|
||||
;; 'all: exclude every path in the package from `raco test`.
(define test-omit-paths 'all)
|
@ -0,0 +1,34 @@
|
||||
#lang debug racket/base
|
||||
(require (for-syntax racket/base) racket/match racket/string)
|
||||
|
||||
;; Reader submodule: turns a semicolon-delimited data file into a module
;; whose module language is "classes-prep.rkt".
(module+ reader
  (require syntax/strip-context)
  (provide (rename-out [rs read-syntax]))

  ;; rs : any input-port -> syntax
  ;; Reads `ip` line by line. Everything from the first `#` to the end of
  ;; a line is treated as a comment and removed; lines that are empty
  ;; after that are skipped. Each remaining line must have the form
  ;;   RANGE;TAG
  ;; where RANGE is either one hexadecimal codepoint or `LO..HI`.
  ;; Every record becomes `((codepoint ...) tag)`, and the records form
  ;; the body of the resulting `module` form.
  (define (rs name ip)
    (define lines
      (for*/list ([line (in-lines ip)]
                  [str (in-value (string-trim (string-trim line #px"#.*" #:left? #false)))]
                  #:when (non-empty-string? str))
        ;; Trim each field so optional padding around the `;` separator
        ;; cannot leak into the tag symbol or make `string->number`
        ;; silently return #f for the codepoints.
        (match-define (list range tag) (map string-trim (string-split str ";")))
        (list (map (λ (bound) (string->number bound 16)) (string-split range ".."))
              (string->symbol tag))))
    ;; Strip lexical context so the generated module is expanded purely
    ;; in terms of its module language.
    (strip-context
     (with-syntax ([LINES lines])
       #'(module _ "classes-prep.rkt"
           . LINES)))))
|
||||
|
||||
;; (make-cond id val)       expands to a test that `id` is the codepoint `val`
;; (make-cond id lo hi)     expands to a test that `id` lies in [lo, hi]
;; `eq?` is used for the single-value case on the assumption that
;; codepoints are fixnums.
(define-syntax make-cond
  (syntax-rules ()
    [(_ subject point) (eq? subject point)]
    [(_ subject low high) (<= low subject high)]))
|
||||
|
||||
(provide (rename-out [mb #%module-begin])
|
||||
(except-out (all-from-out racket/base) #%module-begin))
|
||||
;; Module-begin for modules written in this language. The module body is
;; a sequence of `(VALS RES)` records, where VALS holds one codepoint or
;; a low/high pair and RES is a tag. The expansion defines and provides
;; a function `f` that returns the quoted RES of the first record whose
;; codepoint test (see `make-cond`) accepts its argument. The generated
;; `cond` has no `else` clause, so an unmatched argument yields void.
(define-syntax (mb stx)
  (syntax-case stx ()
    [(_ (VALS RES) ...)
     ;; Give `f` the lexical context of the client module so the name is
     ;; visible to whoever requires that module.
     (let ([fn-id (datum->syntax stx 'f)])
       #`(#%module-begin
          (provide #,fn-id)
          (define (#,fn-id x)
            (cond [(make-cond x . VALS) 'RES] ...))))]))
|
@ -0,0 +1,46 @@
|
||||
#lang racket/base
|
||||
(provide (all-defined-out))
|
||||
|
||||
;; The following break classes are handled by the pair table
|
||||
;; Numeric codes 0-28, one per line-breaking class. Their order matches
;; the first 29 column labels (OP ... RI) of the pair table header in the
;; pair-table source; that header also lists EB, EM and ZWJ, which have no
;; constant in this group.
(define OP 0) ;; Opening punctuation
|
||||
(define CL 1) ;; Closing punctuation
|
||||
(define CP 2) ;; Closing parenthesis
|
||||
(define QU 3) ;; Ambiguous quotation
|
||||
(define GL 4) ;; Glue
|
||||
(define NS 5) ;; Non-starters
|
||||
(define EX 6) ;; Exclamation/Interrogation
|
||||
(define SY 7) ;; Symbols allowing break after
|
||||
(define IS 8) ;; Infix separator
|
||||
(define PR 9) ;; Prefix
|
||||
(define PO 10) ;; Postfix
|
||||
(define NU 11) ;; Numeric
|
||||
(define AL 12) ;; Alphabetic
|
||||
(define HL 13) ;; Hebrew Letter
|
||||
(define ID 14) ;; Ideographic
|
||||
(define IN 15) ;; Inseparable characters
|
||||
(define HY 16) ;; Hyphen
|
||||
(define BA 17) ;; Break after
|
||||
(define BB 18) ;; Break before
|
||||
(define B2 19) ;; Break on either side (but not pair)
|
||||
(define ZW 20) ;; Zero-width space
|
||||
(define CM 21) ;; Combining marks
|
||||
(define WJ 22) ;; Word joiner
|
||||
(define H2 23) ;; Hangul LV
|
||||
(define H3 24) ;; Hangul LVT
|
||||
(define JL 25) ;; Hangul L Jamo
|
||||
(define JV 26) ;; Hangul V Jamo
|
||||
(define JT 27) ;; Hangul T Jamo
|
||||
(define RI 28) ;; Regional Indicator
|
||||
|
||||
;; The following break classes are not handled by the pair table
|
||||
;; Numeric codes 29-39, continuing the numbering directly after RI (28).
(define AI 29) ;; Ambiguous (Alphabetic or Ideograph)
|
||||
(define BK 30) ;; Break (mandatory)
|
||||
(define CB 31) ;; Contingent break
|
||||
(define CJ 32) ;; Conditional Japanese Starter
|
||||
(define CR 33) ;; Carriage return
|
||||
(define LF 34) ;; Line feed
|
||||
(define NL 35) ;; Next line
|
||||
(define SA 36) ;; South-East Asian
|
||||
(define SG 37) ;; Surrogates
|
||||
(define SP 38) ;; Space
|
||||
(define XX 39) ;; Unknown
|
@ -0,0 +1,76 @@
|
||||
#lang at-exp debug racket/base
|
||||
(require racket/string racket/match)
|
||||
(provide (all-defined-out))
|
||||
|
||||
;; Break-action codes stored in the pair table:
;;   DI_BRK  direct break opportunity
;;   IN_BRK  indirect break opportunity
;;   CI_BRK  indirect break opportunity for combining marks
;;   CP_BRK  prohibited break for combining marks
;;   PR_BRK  prohibited break
(define-values (DI_BRK IN_BRK CI_BRK CP_BRK PR_BRK)
  (values 0 1 2 3 4))
|
||||
|
||||
;; tok->val : string -> break-action code
;; Translates one cell symbol of the textual pair table into its numeric
;; break-action code. Any other string raises a match failure.
(define (tok->val tok)
  (match tok
    ["_" DI_BRK]   ; direct break
    ["%" IN_BRK]   ; indirect break
    ["#" CI_BRK]   ; indirect break for combining marks
    ["@" CP_BRK]   ; prohibited break for combining marks
    ["^" PR_BRK])) ; prohibited break
|
||||
|
||||
;; make-pair-table : string ... -> (listof (listof break-action code))
;; Builds the pair table as a list of rows. The first argument is the
;; header line and is ignored; every later argument that is not blank
;; is one table row of the form "LABEL cell cell ...". Each row becomes
;; the list of break-action codes for its cells, in column order.
(define/match (make-pair-table . strs)
  [((cons _ recs))
   (for/list ([rec (in-list recs)]
              #:unless (regexp-match? #px"^\\s*$" rec))
     ;; Split on whitespace and drop the leading row label. Trimming
     ;; with #px"\\w+" is wrong here: `_` is a \w character, so a row
     ;; whose last cell is `_` would lose that cell.
     (for/list ([tok (in-list (cdr (string-split rec)))])
       (tok->val tok)))])
|
||||
|
||||
;; make-pair-hash : string ... -> (hash/c (cons/c symbol symbol) break-action code)
;; Builds the pair table as a hash keyed by `(left . right)`, where
;; `left` is a row label and `right` a column label from the header.
;; The first argument is the header line, the second is skipped (it is
;; the separator that follows the header), and every later non-blank
;; argument is one row of the form "LABEL cell cell ...".
(define/match (make-pair-hash . strs)
  [((list* header _ recs))
   (define rights (map string->symbol (string-split header)))
   (for*/hash ([rec (in-list recs)]
               #:unless (regexp-match? #px"^\\s*$" rec)
               ;; Whitespace-split once: the first field is the row label,
               ;; the rest are the cells. This avoids trimming with \w+,
               ;; which would also eat a trailing `_` cell.
               [fields (in-value (string-split rec))]
               [left (in-value (string->symbol (car fields)))]
               ;; Walk column labels and cells in lockstep. Nesting the two
               ;; loops would overwrite every key of a row with that row's
               ;; last cell.
               [(right tok) (in-parallel (in-list rights) (in-list (cdr fields)))])
     (values (cons left right) (tok->val tok)))])
|
||||
|
||||
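;; Pair table copied from https://www.unicode.org/reports/tr14/tr14-37.html#Table2
;; The first line is the header of column ("right") classes; each following
;; line starts with its row ("left") class label.
;; Cell symbols: ^ prohibited break, @ prohibited break for combining marks,
;; _ direct break, % indirect break, # indirect break for combining marks.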
|
||||
(define table-strs
|
||||
@list|{
|
||||
OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI EB EM ZWJ
|
||||
OP ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ @ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
|
||||
CL _ ^ ^ % % ^ ^ ^ ^ % % _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
CP _ ^ ^ % % ^ ^ ^ ^ % % % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
QU ^ ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
|
||||
GL % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
|
||||
NS _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
EX _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
SY _ ^ ^ % % % ^ ^ ^ _ _ % _ % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
IS _ ^ ^ % % % ^ ^ ^ _ _ % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
PR % ^ ^ % % % ^ ^ ^ _ _ % % % % _ % % _ _ ^ # ^ % % % % % _ % % %
|
||||
PO % ^ ^ % % % ^ ^ ^ _ _ % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
NU % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
AL % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
HL % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
ID _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
IN _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
HY _ ^ ^ % _ % ^ ^ ^ _ _ % _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
BA _ ^ ^ % _ % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
BB % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
|
||||
B2 _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ ^ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
ZW _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ^ _ _ _ _ _ _ _ _ _ _ _
|
||||
CM % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
WJ % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
|
||||
H2 _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ % % _ _ _ %
|
||||
H3 _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ % _ _ _ %
|
||||
JL _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ % % % % _ _ _ _ %
|
||||
JV _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ % % _ _ _ %
|
||||
JT _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ % _ _ _ %
|
||||
RI _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ % _ _ %
|
||||
EB _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ % %
|
||||
EM _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
|
||||
ZWJ _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ % _ % % _ _ ^ # ^ _ _ _ _ _ _ % % %
|
||||
}|)
|
||||
;; Pair table as a list of rows, each row a list of break-action codes.
(define pair-table (apply make-pair-table table-strs))
|
||||
;; Pair table as a hash keyed by (left-class . right-class) symbol pairs.
(define pair-hash (apply make-pair-hash table-strs))
|
||||
|
||||
;; Running this file directly evaluates `pair-table` at module level,
;; which prints it.
(module+ main
|
||||
pair-table)
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue