不到60行代码,用Racket实现wc命令

Racket语言开发命令行程序十分简单,而且支持跨平台编译,我们可以用它来开发各种方便的工具程序。下面实现了一个完整的wc程序,只需50多行代码就完成了。

#lang racket
(require racket/cmdline)

;; Output switches: each parameter holds #t when the corresponding count
;; should be printed. All three start enabled.
(define-values (show-bytes show-lines show-words)
    (values (make-parameter #t)
            (make-parameter #t)
            (make-parameter #t)))

;; File names taken from the command line; starts as the empty list.
(define files (make-parameter '()))

;; Channel on which each scanning thread hands its result list
;; to the printing loop in do-work.
(define re-chan (make-channel))

;; Parse the command line.
;; The three switches are mutually exclusive (#:once-any): choosing one
;; enables that count and disables the other two. With no switch, all three
;; counts keep their default (enabled). Every remaining argument is stored
;; in the `files` parameter as a list of file names.
(define cmd-line
    (command-line
        #:program "r-wc"
        #:once-any
            [("-c" "--bytes" "--chars") "The number of bytes" (show-bytes #t) (show-lines #f) (show-words #f)]
            [("-l" "--lines") "The number of lines" (show-lines #t) (show-bytes #f) (show-words #f)]
            ;; Help text fixed: it used to read "The muber of words".
            [("-w" "--words") "The number of words" (show-words #t) (show-bytes #f) (show-lines #f)]
        #:args file (files file)))

;; Start one scanning thread per file, then print one line per file as the
;; results arrive on re-chan (arrival order, not command-line order).
;; When no file name was given, print a hint and return.
(define (do-work)
    (cond
        [(null? (files)) (displayln "Please enter the file name")]
        [else
            (for-each scan-file (files))
            ;; Exactly one result is expected on the channel per file.
            (for ([done (in-range (length (files)))])
                (match (channel-get re-chan)
                    [(list l w b f)
                     ;; Keep only the counts whose switch is enabled,
                     ;; in the order lines, words, bytes.
                     (define shown
                         (for/list ([d (list l w b)]
                                    [t (list (show-lines) (show-words) (show-bytes))]
                                    #:when t)
                             (number->string d)))
                     ;; Print "<counts...> <file>" separated by spaces.
                     ;; Appending the file-name string directly as the list
                     ;; tail would build an improper list that prints as
                     ;; (3 5 20 . a.txt).
                     (displayln (string-join (append shown (list (format "~a" f))) " "))]))]))

;; Count lines, words and bytes of file `f` in a new thread and deliver
;; (list lines words bytes f) on re-chan. Returns the thread.
;;
;; If the file cannot be opened or read, the error is reported on stderr and
;; a zero-count result is still delivered; without this the consumer, which
;; waits for one result per file, would block forever.
(define (scan-file f)
    (thread (lambda ()
        (define result
            (with-handlers ([exn:fail?
                             (lambda (e)
                                 (eprintf "r-wc: ~a: ~a\n" f (exn-message e))
                                 (list 0 0 0 f))])
                (call-with-input-file f
                    (lambda (in)
                        (let loop ([a-line (read-line in)] [bytes 0] [lines 0] [words 0])
                            (if (eof-object? a-line)
                                (list lines words bytes f)
                                ;; parse-line returns the word and byte
                                ;; counts for one line.
                                (let-values ([(w b) (parse-line a-line)])
                                    (loop (read-line in)
                                          (+ bytes b)
                                          (add1 lines)
                                          (+ words w)))))))))
        ;; Deliver after the port has been closed.
        (channel-put re-chan result))))

;; Count the words and bytes of one line of text.
;;
;; a-line : string, one line without its terminating newline.
;; Returns two values:
;;   words - number of maximal runs of non-whitespace characters
;;           (an empty or all-blank line has 0 words; tabs separate words).
;;   bytes - UTF-8 length of the line plus 1 for the newline that the
;;           caller stripped. NOTE(review): a final line that has no
;;           trailing newline is therefore over-counted by one byte.
(define (parse-line a-line)
    (define-values (words in-word?)
        (for/fold ([words 0] [in-word? #f]) ([c (in-string a-line)])
            (cond
                [(char-whitespace? c) (values words #f)]
                ;; Still inside the word that was already counted.
                [in-word? (values words #t)]
                ;; First character of a new word.
                [else (values (add1 words) #t)])))
    (values words (add1 (string-utf-8-length a-line))))

;; Program entry: scan the files given on the command line and print the counts.
(do-work)

然后用raco生成一个可执行程序, 名字就叫r-wc,和wc稍微区分一下:

raco exe -o r-wc r-wc.rkt

使用方式和wc命令一样:

➜  ./r-wc -h
usage: r-wc [ <option> ... ] [<file>] ...

<option> is one of

/ -c, --bytes, --chars
|    The number of bytes
| -l, --lines
|    The number of lines
| -w, --words
\    The muber of words
  --help, -h
     Show this help
  --
     Do not treat any remaining argument as a switch (at this level)

 /|\ Brackets indicate mutually exclusive options.

 Multiple single-letter switches can be combined after
 one `-`. For example, `-h-` is the same as `-h --`.

上面的程序执行效率不是很好,主要表现在读取文件和解析每一行时,占用的时间和内存太大。Racket提供了序列(Sequences)这一特性,它可以配合for形式遍历数据结构,而且效率高很多,因此可以使用Sequences来重构我们的程序,将上文的两个函数scan-file和parse-line修改为:

;; Sequence-based version: count lines, words and bytes of file `f` in a new
;; thread and deliver (list lines words bytes f) on re-chan. Returns the thread.
;; Lines are read as byte strings via in-bytes-lines and handed to parse-line.
;;
;; If the file cannot be opened or read, the error is reported on stderr and
;; a zero-count result is still delivered; without this the consumer, which
;; waits for one result per file, would block forever.
(define (scan-file f)
    (thread (lambda ()
        (define result
            (with-handlers ([exn:fail?
                             (lambda (e)
                                 (eprintf "r-wc: ~a: ~a\n" f (exn-message e))
                                 (list 0 0 0 f))])
                (call-with-input-file f
                    (lambda (in)
                        (define-values (lines words bytes)
                            (for/fold ([lines 0] [words 0] [bytes 0])
                                      ([a-line (in-bytes-lines in)])
                                ;; The helper is named parse-line; the former
                                ;; call to parse-line-seq had no definition.
                                (let-values ([(w b) (parse-line a-line)])
                                    (values (add1 lines) (+ words w) (+ bytes b)))))
                        (list lines words bytes f)))))
        ;; Deliver after the port has been closed.
        (channel-put re-chan result))))

;; Count the words and bytes of one line given as a byte string.
;;
;; a-line : bytes, one line without its terminating newline.
;; Returns two values:
;;   words - number of maximal runs of non-whitespace bytes
;;           (an empty or all-blank line has 0 words).
;;   bytes - length of the line plus 1 for the newline that the caller
;;           stripped. NOTE(review): a final line that has no trailing
;;           newline is therefore over-counted by one byte.
(define (parse-line a-line)
    ;; in-bytes yields integers, so whitespace is tested numerically:
    ;; space (32) and the range tab..carriage return (9-13).
    (define (blank? byte) (or (= byte 32) (<= 9 byte 13)))
    (define-values (words in-word?)
        (for/fold ([words 0] [in-word? #f]) ([byte (in-bytes a-line)])
            (cond
                [(blank? byte) (values words #f)]
                ;; Still inside the word that was already counted.
                [in-word? (values words #t)]
                ;; First byte of a new word.
                [else (values (add1 words) #t)])))
    (values words (add1 (bytes-length a-line))))

两个版本对比一下(文件大小100MB):

  Old version Sequences version wc
时间 4.6s 0.8s 0.4s
峰值内存 132M 115M 1M

经过Sequences重构之后,执行时间大幅缩短,峰值内存占用也有所减少,不过与wc相比还是有不少差距。希望有一天能够找到超越wc的方法。


欢迎加入Racket 隐修会 :731859928(QQ)