diff --git a/debug.rkt b/debug.rkt index ce739fc..f1bd764 100644 --- a/debug.rkt +++ b/debug.rkt @@ -3,7 +3,6 @@ (require racket/string) (require racket/format) -(require "readability.rkt") (provide (all-defined-out)) diff --git a/decode.rkt b/decode.rkt index c4a2d4f..f755f93 100644 --- a/decode.rkt +++ b/decode.rkt @@ -2,10 +2,12 @@ (require racket/contract) (require racket/list) (require racket/string) - +(require (only-in racket/format ~a)) +(require (only-in racket/bool nor)) +(require (only-in xml xexpr/c)) (module+ test (require rackunit)) -(require "tools.rkt") +(require "tools.rkt" "library/html.rkt") (provide (all-defined-out)) ;; Find adjacent newline characters in a list and merge them into one item @@ -30,7 +32,7 @@ ;; otherwise peel off elements up to the next newline, append them to accumulator, ;; and recurse on the rest (really-merge-newlines (dropf remainder not-newline?) - (append acc (takef remainder not-newline?))))))) + (append acc (takef remainder not-newline?))))))) (cond [(list? x) (really-merge-newlines (map merge-newlines x))] @@ -41,18 +43,35 @@ '(p "\n" "foo" "\n\n" "bar" (em "\n\n\n")))) -; Mon Aug 5: start here +;; is the named-xexpr a block element (as opposed to inline) +(define/contract (block-xexpr? nx) + (named-xexpr? . -> . boolean?) + ;; this is a change in behavior since first pollen + ;; blocks are only the ones on the html block tag list. + ;; todo: make sure this is what I want. + ;; this is, however, more consistent with browser behavior + ;; (browsers assume that tags are inline by default) + (as-boolean (in block-tags (named-xexpr-name nx)))) -; A block-xexpr is a named expression that's not on the inline list -; todo: bear in mind that browsers take the opposite view: -; that only elements on the block list are blocks -; and otherwise are treated as inline -(define (block-xexpr? x) - (and (named-xexpr? x) (not (in? inline-tags (car x))))) +(module+ test + (check-true (block-xexpr? '(p "foo"))) + (check-true (block-xexpr? '(div "foo"))) + (check-false (block-xexpr? '(em "foo"))) + (check-false (block-xexpr? '(barfoo "foo")))) +;; start here Tues 6 +;; default content decoder for pollen +(define/contract (decode nx) + ;; use xexpr/c for contact because it gives better error messages + (xexpr/c . -> . named-xexpr?) + nx + + ) +;(decode `(p ((key "value")) ,decode)) -; default content decoder for pollen +#| +;; default content decoder for pollen (define/contract (decode x) (named-xexpr? . -> . named-xexpr?) @@ -67,7 +86,7 @@ decoded-x))] [(xexpr-content? x) ; a list of xexprs (let ([x (prep-paragraph-flow x)]) - (map &decode (if (any paragraph-break? x) ; need this condition to prevent infinite recursion + (map &decode (if (ormap paragraph-break? x) ; need this condition to prevent infinite recursion (map wrap-paragraph (splitf-at* x paragraph-break?)) ; split into ¶¶ x)))] [(string? x) (typogrify x)] @@ -86,3 +105,4 @@ ;todo: improve this error message, more specific location ; now, it just spits out the whole defective content (error (format "decode: ~v not a full named-xexpr" x))))) +|# \ No newline at end of file diff --git a/library/html.rkt b/library/html.rkt index 06dc9f6..8be96b8 100644 --- a/library/html.rkt +++ b/library/html.rkt @@ -1,14 +1,17 @@ -#lang racket +#lang racket/base -(require (planet mb/pollen/syntax)) +(require "../syntax.rkt") -; for now, body is deemed a block, not inline -;todo: is this legit? Why is body inline? +;; for now, body is deemed a block, not inline +;; todo: is this legit? Why is body inline? (define block-tags '(address article aside audio blockquote body canvas dd div dl fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6 header hgroup noscript ol output p pre section table tfoot ul video)) -; for now, map is omitted because it's a Racket keyword -; for now, style, script, and link are omitted because they shouldn't be wrapped + +;; for now, map is omitted because it's a Racket keyword +;; for now, style, script, and link are omitted because they shouldn't be wrapped +;; todo: figure out how to resolve this. should be able to use html version of map. +;; possibly prefixed? Probably a good idea so it doesn't collide with user namespace (define inline-tags '(a abbr acronym applet area b base basefont bdo big br button caption center cite code col colgroup del dir dfn dt em embed font frame framesethead hr html i iframe img input ins isindex kbd label legend li menu meta noframes object optgroup option param q s samp select small span strike strong sub sup tbody td textarea th thead title tr tt u var xmp)) diff --git a/readability.rkt b/readability.rkt index 2dbd402..8a510c7 100644 --- a/readability.rkt +++ b/readability.rkt @@ -1,8 +1,11 @@ #lang racket/base (require racket/contract) -(require (only-in racket/list empty?)) +(require (only-in racket/list empty? range)) (require (only-in racket/format ~a)) +(require (only-in racket/string string-join)) +(require (only-in racket/vector vector-member)) (module+ test (require rackunit)) +(require "debug.rkt") (provide (all-defined-out)) @@ -10,6 +13,7 @@ (define/contract (as-string x) (any/c . -> . string?) (cond + [(string? x) x] [(empty? x) ""] [(symbol? x) (symbol->string x)] [(number? x) (number->string x)] @@ -18,6 +22,7 @@ [else (error (format "Can't make ~a into string" x))])) (module+ test + (check-equal? (as-string "foo") "foo") (check-equal? (as-string '()) "") (check-equal? (as-string 'foo) "foo") (check-equal? (as-string 123) "123") @@ -26,6 +31,12 @@ (check-equal? (as-string #\¶) "¶")) +;; general way of coercing to symbol +(define (as-symbol thing) + ; todo: on bad input, it will pop a string error rather than symbol error + (string->symbol (as-string thing))) + + ;; general way of coercing to a list (define (as-list x) (any/c . -> . list?) @@ -39,6 +50,23 @@ (check-equal? (as-list (list->vector '(1 2 3))) '(1 2 3)) (check-equal? (as-list "foo") (list "foo"))) + +;; general way of coercing to boolean +(define (as-boolean x) + (any/c . -> . boolean?) + ;; in Racket, everything but #f is true + (if x #t #f)) + +(module+ test + (check-true (as-boolean #t)) + (check-false (as-boolean #f)) + (check-true (as-boolean "#f")) + (check-true (as-boolean "foo")) + (check-true (as-boolean '())) + (check-true (as-boolean '(1 2 3)))) + + + ;; general way of asking for length (define (len x) (any/c . -> . integer?) @@ -59,5 +87,93 @@ (check-not-equal? (len 'fo) 3) ; len 2 (check-equal? (len (list->vector '(1 2 3))) 3) (check-not-equal? (len (list->vector '(1 2))) 3) ; len 2 - (check-equal? (len (make-hash '((a 1) (b 2) (c 3)))) 3) - (check-not-equal? (len (make-hash '((a 1) (b 2) (b 3)))) 3)) ; len 2 \ No newline at end of file + (check-equal? (len (make-hash '((a . 1) (b . 2) (c . 3)))) 3) + (check-not-equal? (len (make-hash '((a . 1) (b . 2)))) 3)) ; len 2 + + + +;; general way of fetching an item from a container +(define/contract (get container item [up-to #f]) + ((any/c any/c) ((λ(i)(or (integer? i) (and (symbol? i) (equal? i 'end))))) + . ->* . any/c) + + (define (sliceable-container? container) + (ormap (λ(proc) (proc container)) (list list? string? vector?))) + + (when (sliceable-container? container) + (set! up-to + (cond + ;; treat negative lengths as offset from end (Python style) + [(and (integer? up-to) (< up-to 0)) (+ (len container) up-to)] + ;; 'end slices to the end + [(equal? up-to 'end) (len container)] + ;; default to slice length of 1 (i.e, single-item retrieval) + [(equal? up-to #f) (add1 item)] + [else up-to]))) + + (define result (cond + ;; for sliceable containers, make a slice + [(list? container) (for/list ([i (range item up-to)]) + (list-ref container i))] + [(vector? container) (for/vector ([i (range item up-to)]) + (vector-ref container i))] + [(string? container) (substring container item up-to)] + [(symbol? container) (as-symbol (get (as-string container) item up-to))] + ;; for hash, just get item + [(hash? container) (hash-ref container item)] + [else #f])) + + ;; don't return single-item results inside a list + (if (and (sliceable-container? result) (= (len result) 1)) + (car (as-list result)) + result)) + +(module+ test + (check-equal? (get '(0 1 2 3 4 5) 2) 2) + (check-equal? (get '(0 1 2 3 4 5) 0 2) '(0 1)) + (check-equal? (get '(0 1 2 3 4 5) 2 -1) '(2 3 4)) + (check-equal? (get '(0 1 2 3 4 5) 2 'end) '(2 3 4 5)) + (check-equal? (get (list->vector '(0 1 2 3 4 5)) 2) 2) + (check-equal? (get (list->vector'(0 1 2 3 4 5)) 0 2) (list->vector '(0 1))) + (check-equal? (get (list->vector'(0 1 2 3 4 5)) 2 -1) (list->vector '(2 3 4))) + (check-equal? (get (list->vector'(0 1 2 3 4 5)) 2 'end) (list->vector '(2 3 4 5))) + (check-equal? (get "purple" 2) "r") + (check-equal? (get "purple" 0 2) "pu") + (check-equal? (get "purple" 2 -1) "rpl") + (check-equal? (get "purple" 2 'end) "rple") + (check-equal? (get 'purple 2) 'r) + (check-equal? (get 'purple 0 2) 'pu) + (check-equal? (get 'purple 2 -1) 'rpl) + (check-equal? (get 'purple 2 'end) 'rple) + (check-equal? (get (make-hash '((a . 1) (b . 2) (c . 3))) 'a) 1)) + +;; general way of testing for membership (à la Python 'in') +(define/contract (in container item) + (any/c any/c . -> . any/c) + (cond + [(list? container) (member item container)] ; returns #f or sublist beginning with item + [(vector? container) (vector-member item container)] ; returns #f or zero-based item index + [(hash? container) + (and (hash-has-key? container item) (get container item))] ; returns #f or hash value + [(string? container) (let ([result (in (map as-string (string->list container)) (as-string item))]) + (if result + (string-join result "") + #f))] ; returns #f or substring beginning with item + [(symbol? container) (let ([result (in (as-string container) (as-string item))]) + (if result + (as-symbol result) + result))] ; returns #f or subsymbol (?!) beginning with item + [else #f])) + +(module+ test + (check-equal? (in '(1 2 3) 2) '(2 3)) + (check-false (in '(1 2 3) 4)) + (check-equal? (in (list->vector '(1 2 3)) 2) 1) + (check-false (in (list->vector '(1 2 3)) 4)) + (check-equal? (in (make-hash '((a . 1) (b . 2) (c . 3))) 'a) 1) + (check-false (in (make-hash '((a . 1) (b . 2) (c . 3))) 'x)) + (check-equal? (in "foobar" "o") "oobar") + (check-false (in "foobar" "z")) + (check-equal? (in 'foobar 'o) 'oobar) + (check-false (in 'foobar 'z)) + (check-false (in #\F "F"))) \ No newline at end of file diff --git a/tools.rkt b/tools.rkt index 9e89a6f..df69395 100644 --- a/tools.rkt +++ b/tools.rkt @@ -4,7 +4,7 @@ (require (only-in racket/format ~a)) (require (only-in racket/list empty empty? second filter-not splitf-at takef dropf)) (require (only-in racket/string string-join)) -(require (only-in xml xexpr?)) +(require (only-in xml xexpr? xexpr/c)) (require "readability.rkt" "debug.rkt") (provide (all-defined-out) (all-from-out "readability.rkt" "debug.rkt")) @@ -113,6 +113,8 @@ (check-false (named-xexpr? '("p" "foo" "bar"))) ; no name (check-false (named-xexpr? '(p 123)))) ; content is a number + + ;; helper for comparison of values ;; normal function won't work for this. Has to be syntax-rule (define-syntax-rule (values->list vs) @@ -154,6 +156,29 @@ (values->list (values 'p '((key "value")) '("foo"))))) +;; convenience functions to retrieve only one part of named-xexpr +(define (named-xexpr-name nx) + (named-xexpr? . -> . symbol?) + (define-values (name attr content) (break-named-xexpr nx)) + name) + +(define (named-xexpr-attr nx) + (named-xexpr? . -> . xexpr-attr?) + (define-values (name attr content) (break-named-xexpr nx)) + attr) + +(define (named-xexpr-content nx) + (named-xexpr? . -> . xexpr-content?) + (define-values (name attr content) (break-named-xexpr nx)) + content) + +(module+ test + (check-equal? (named-xexpr-name '(p ((key "value"))"foo" "bar" (em "square"))) 'p) + (check-equal? (named-xexpr-attr '(p ((key "value"))"foo" "bar" (em "square"))) '((key "value"))) + (check-equal? (named-xexpr-content '(p ((key "value"))"foo" "bar" (em "square"))) + '("foo" "bar" (em "square")))) + + ;; apply filter proc recursively (define/contract (filter-tree proc tree) (procedure? list? . -> . list?)