add unicode-linebreak

main
Matthew Butterick 3 months ago
parent 40b87a4bb9
commit 342422e221
  1. 48
      unicode-linebreak/LICENSE.md
  2. 4
      unicode-linebreak/README.md
  3. 4
      unicode-linebreak/info.rkt
  4. 34
      unicode-linebreak/unicode-linebreak/classes-prep.rkt
  5. 46
      unicode-linebreak/unicode-linebreak/classes.rkt
  6. 76
      unicode-linebreak/unicode-linebreak/pairs.rkt
  7. 3059
      unicode-linebreak/unicode-linebreak/unicode-classes.rkt

@ -0,0 +1,48 @@
MIT License
unicode-linebreak is © 2019 Matthew Butterick
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
---
unicode-linebreak contains substantial portions of the following software:
[linebreak](https://github.com/foliojs/linebreak)
MIT LICENSE
Copyright (c) 2014-16 Devon Govett
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,4 @@
# unicode-linebreak
A Racket implementation of the Unicode line breaking algorithm (UAX #14).
Based on https://github.com/foliojs/linebreak

@ -0,0 +1,4 @@
#lang info
;; Package-level metadata for unicode-linebreak.
;; NOTE(review): 'multi presumably marks this as a multi-collection package
;; (each subdirectory is its own collection) — confirm against the info-file docs.
(define collection 'multi)
(define version "0.0")
;; NOTE(review): presumably tells `raco test` to skip every file in this
;; package — confirm this is still wanted once tests exist.
(define test-omit-paths 'all)

@ -0,0 +1,34 @@
#lang debug racket/base
(require (for-syntax racket/base) racket/match racket/string)
;; Reader submodule: lets a data file written in the `range;class` line format
;; be read as a module whose language is "classes-prep.rkt".
(module+ reader
  (require syntax/strip-context)
  (provide (rename-out [rs read-syntax]))

  ;; rs : any/c input-port? -> syntax?
  ;; Reads every line from `ip`, drops `#` comments and blank lines, and turns
  ;; each remaining `RANGE;TAG` line into the datum `((LO HI) TAG)` or
  ;; `((CODEPOINT) TAG)`, where the numbers are parsed as hexadecimal.
  ;; The result is a `module` form whose body is that list of data.
  ;; Raises an error if a codepoint field is not valid hexadecimal, or
  ;; (via `match-define`) if a line does not have exactly two `;`-separated fields.
  (define (rs name ip)
    (define lines
      (for*/list ([line (in-lines ip)]
                  ;; strip the trailing comment first, then surrounding whitespace
                  [str (in-value (string-trim (string-trim line #px"#.*" #:left? #false)))]
                  #:when (non-empty-string? str))
        ;; Fields may be padded with spaces around the `;` separator, so trim
        ;; each field before interpreting it.
        (match-define (list range tag) (map string-trim (string-split str ";")))
        (list (for/list ([hex (in-list (string-split range ".."))])
                ;; fail loudly rather than emitting `#f` into the generated `cond`
                (or (string->number (string-trim hex) 16)
                    (error 'read-syntax "bad hexadecimal codepoint ~s in line: ~a" hex line)))
              (string->symbol tag))))
    ;; strip-context so the generated module picks up bindings from
    ;; "classes-prep.rkt" rather than from this reader.
    (strip-context
     (with-syntax ([LINES lines])
       #'(module _ "classes-prep.rkt"
           . LINES)))))
;; make-cond: builds the membership test for one entry of the class table.
;;   (make-cond x cp)     => is `x` exactly the codepoint `cp`?
;;   (make-cond x lo hi)  => does `x` fall in the inclusive range lo..hi?
(define-syntax make-cond
  (syntax-rules ()
    ;; I believe `eq?` is OK because a codepoint is a fixnum
    [(_ subject single) (eq? subject single)]
    [(_ subject low high) (<= low subject high)]))
;; This module is used as a module language: it re-exports racket/base, but
;; swaps in `mb` as the `#%module-begin`.
(provide (except-out (all-from-out racket/base) #%module-begin)
         (rename-out [mb #%module-begin]))

;; mb: module-begin for a class-table module.
;; The module body is a sequence of `(RANGE TAG)` entries, where RANGE is
;; either `(codepoint)` or `(low high)`. The expansion defines and provides a
;; function `f` that checks its argument against each RANGE in order and
;; returns the quoted TAG of the first entry that matches.
(define-syntax (mb stx)
  (syntax-case stx ()
    [(_ (RANGE TAG) ...)
     ;; give `f` the caller's lexical context so the export is named `f`
     (let ([lookup-id (datum->syntax stx 'f)])
       #`(#%module-begin
          (provide #,lookup-id)
          (define (#,lookup-id x)
            (cond [(make-cond x . RANGE) 'TAG] ...))))]))

@ -0,0 +1,46 @@
#lang racket/base
(provide (all-defined-out))
;; Line-break class constants. Each class abbreviation is bound to a distinct
;; small integer, numbered consecutively from 0.
;; NOTE(review): the pair table in pairs.rkt also has rows/columns for EB, EM
;; and ZWJ, which have no constants here — confirm whether that is intended.

;; The following break classes are handled by the pair table
(define OP 0) ;; Opening punctuation
(define CL 1) ;; Closing punctuation
(define CP 2) ;; Closing parenthesis
(define QU 3) ;; Ambiguous quotation
(define GL 4) ;; Glue
(define NS 5) ;; Non-starters
(define EX 6) ;; Exclamation/Interrogation
(define SY 7) ;; Symbols allowing break after
(define IS 8) ;; Infix separator
(define PR 9) ;; Prefix
(define PO 10) ;; Postfix
(define NU 11) ;; Numeric
(define AL 12) ;; Alphabetic
(define HL 13) ;; Hebrew Letter
(define ID 14) ;; Ideographic
(define IN 15) ;; Inseparable characters
(define HY 16) ;; Hyphen
(define BA 17) ;; Break after
(define BB 18) ;; Break before
(define B2 19) ;; Break on either side (but not pair)
(define ZW 20) ;; Zero-width space
(define CM 21) ;; Combining marks
(define WJ 22) ;; Word joiner
(define H2 23) ;; Hangul LV
(define H3 24) ;; Hangul LVT
(define JL 25) ;; Hangul L Jamo
(define JV 26) ;; Hangul V Jamo
(define JT 27) ;; Hangul T Jamo
(define RI 28) ;; Regional Indicator
;; The following break classes are not handled by the pair table
(define AI 29) ;; Ambiguous (Alphabetic or Ideograph)
(define BK 30) ;; Break (mandatory)
(define CB 31) ;; Contingent break
(define CJ 32) ;; Conditional Japanese Starter
(define CR 33) ;; Carriage return
(define LF 34) ;; Line feed
(define NL 35) ;; Next line
(define SA 36) ;; South-East Asian
(define SG 37) ;; Surrogates
(define SP 38) ;; Space
(define XX 39) ;; Unknown

@ -0,0 +1,76 @@
#lang at-exp debug racket/base
(require racket/string racket/match)
(provide (all-defined-out))
;; Break actions: the values stored in each cell of the pair table.
(define DI_BRK 0) ; Direct break opportunity
(define IN_BRK 1) ; Indirect break opportunity
(define CI_BRK 2) ; Indirect break opportunity for combining marks
(define CP_BRK 3) ; Prohibited break for combining marks
(define PR_BRK 4) ; Prohibited break

;; tok->val : string? -> exact-nonnegative-integer?
;; Converts one cell symbol of the textual pair table into its break action.
;; Raises a match error for any other token.
(define/match (tok->val tok)
  [("^") PR_BRK]
  [("@") CP_BRK]
  [("_") DI_BRK]
  [("%") IN_BRK]
  [("#") CI_BRK])

;; blank-rec? : string? -> boolean?
;; True for strings that are empty or whitespace only (e.g. the "\n" strings
;; that separate the lines of the table text).
(define (blank-rec? rec)
  (regexp-match? #px"^\\s*$" rec))

;; rec->fields : string? -> (values string? (listof string?))
;; Splits one table row on whitespace into its row label and its cell tokens.
;; Splitting (instead of trimming with #px"\\w+") matters because `_` is itself
;; a `\w` character: trimming also removed a trailing `_` cell from the row.
(define (rec->fields rec)
  (define fields (string-split rec))
  (values (car fields) (cdr fields)))

;; make-pair-table : string? ... -> (listof (listof exact-nonnegative-integer?))
;; Takes the header line followed by the row lines (blank strings are ignored)
;; and returns one list of break actions per row, in row order. The header and
;; the row labels are discarded.
(define/match (make-pair-table . strs)
  [((cons header recs))
   (for/list ([rec (in-list recs)]
              #:unless (blank-rec? rec))
     (define-values (label toks) (rec->fields rec))
     (map tok->val toks))])

;; make-pair-hash : string? ... -> (hash/c (cons/c symbol? symbol?) exact-nonnegative-integer?)
;; Takes the same strings as `make-pair-table` and returns a hash keyed by
;; `(cons row-label column-label)`, both as symbols, whose value is the break
;; action found in that cell.
(define/match (make-pair-hash . strs)
  [((cons header recs))
   (define rights (map string->symbol (string-split header)))
   (for*/hash ([rec (in-list recs)]
               #:unless (blank-rec? rec)
               [fields (in-value (string-split rec))]
               [left (in-value (string->symbol (car fields)))]
               ;; Walk the column labels and the row's cells together: the
               ;; n-th token belongs to the n-th column. Nesting these two
               ;; loops would overwrite every key with the row's last token.
               [(right tok) (in-parallel (in-list rights) (in-list (cdr fields)))])
     (values (cons left right) (tok->val tok)))])
;; table copied from https://www.unicode.org/reports/tr14/tr14-37.html#Table2
;; The text below is kept as a list of strings. The first line is the header
;; naming the columns; every following line starts with its row label and then
;; holds one cell per column. Cell symbols (see `tok->val`):
;;   ^  prohibited break
;;   @  prohibited break for combining marks
;;   _  direct break opportunity
;;   %  indirect break opportunity
;;   #  indirect break opportunity for combining marks
(define table-strs
@list|{
OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI EB EM ZWJ
OP ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ @ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
CL _ ^ ^ % % ^ ^ ^ ^ % % _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
CP _ ^ ^ % % ^ ^ ^ ^ % % % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
QU ^ ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
GL % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
NS _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
EX _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
SY _ ^ ^ % % % ^ ^ ^ _ _ % _ % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
IS _ ^ ^ % % % ^ ^ ^ _ _ % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
PR % ^ ^ % % % ^ ^ ^ _ _ % % % % _ % % _ _ ^ # ^ % % % % % _ % % %
PO % ^ ^ % % % ^ ^ ^ _ _ % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
NU % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
AL % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
HL % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
ID _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
IN _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
HY _ ^ ^ % _ % ^ ^ ^ _ _ % _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
BA _ ^ ^ % _ % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
BB % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
B2 _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ ^ ^ # ^ _ _ _ _ _ _ _ _ %
ZW _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ^ _ _ _ _ _ _ _ _ _ _ _
CM % ^ ^ % % % ^ ^ ^ % % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
WJ % ^ ^ % % % ^ ^ ^ % % % % % % % % % % % ^ # ^ % % % % % % % % %
H2 _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ % % _ _ _ %
H3 _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ % _ _ _ %
JL _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ % % % % _ _ _ _ %
JV _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ % % _ _ _ %
JT _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ % _ _ _ %
RI _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ % _ _ %
EB _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ % %
EM _ ^ ^ % % % ^ ^ ^ _ % _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ _ _ _ %
ZWJ _ ^ ^ % % % ^ ^ ^ _ _ _ _ _ % _ % % _ _ ^ # ^ _ _ _ _ _ _ % % %
}|)
;; Both views of the table are built once, when this module is instantiated:
;; `pair-table` is the list-of-rows form, `pair-hash` the hash keyed by a
;; (row-label . column-label) pair.
(define pair-table (apply make-pair-table table-strs))
(define pair-hash (apply make-pair-hash table-strs))
;; When this file is run as the main program, evaluate `pair-table` so the
;; parsed table can be inspected.
(module+ main
pair-table)

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save