[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
uri encode (was: quick search doesn't work for waei:, etc.)
>> On Wed, 09 Jul 2003 23:06:11 +0900
>> 「土」== tsuchiya@pine.kuee.kyoto-u.ac.jp (TSUCHIYA Masatoshi) said as follows:
土> しかし,本来,url は文字列として扱われるべきものですから,内部的に
土> は常に文字列として扱い,ネットワークに送信する(w3m コマンドを呼び
土> 出す)直前に encode するべきだと思います.しかし,それを実現するた
土> めには,その url を encode するのに適切な coding-system を常に文字
土> 列とペアで保持しておくように改善する必要があります.このような大き
土> な変更を emacs-w3m-1_3 枝に加えるのは,とても面倒なので,とりあえ
土> ず先送りして,adhoc な対処で済ませています.
山> text-props は使えるかしら。
土> 少し検討してみます.
検討してみました.意外と簡単な変更で出来たのですが,今度は別の問題で困っ
ています.
つまり,arrived DB には文字列の URI を記録するべきか,それとも,encode
済みの octet 列の URI を記録するべきか,という問題です.添付のパッチを
適用して,
http://example/日本語ファイル名
にアクセスすると,実際には
(w3m-uri-transfer-encode-string "http://example/日本語ファイル名")
=> "http://example/%93%fa%96%7b%8c%ea%83%74%83%40%83%43%83%8b%96%bc"
にアクセスしにいくので,encode 済みの方が real URL という扱いになり,
arrived DB には両方の URL の記録が残ります.
ユーザーの入力を考えると,emacs-w3m 内部では URI は文字列として扱うべ
きだとは思いますから,必然的に arrived DB も文字列の場合だけを記録に残
すようにするべきだ,ということになるのですけれど,そうすると w3m の実
行結果をなんとか解釈し直す必要があって,悩んでいます.
何か,簡単で良い方法はないでしょうか?
--
土屋 雅稔 ( TSUCHIYA Masatoshi )
Index: w3m-form.el
===================================================================
RCS file: /storage/cvsroot/emacs-w3m/w3m-form.el,v
retrieving revision 1.106
diff -u -r1.106 w3m-form.el
--- w3m-form.el 18 Jun 2003 02:07:17 -0000 1.106
+++ w3m-form.el 23 Jul 2003 15:22:16 -0000
@@ -448,11 +448,11 @@
(enctype :case-ignore)
(charset :case-ignore))
(when action
- (setq action (w3m-url-transfer-encode-string
- (w3m-decode-anchor-string action)
- (if charset
- (w3m-charset-to-coding-system charset)
- w3m-current-coding-system))))
+ (setq action (w3m-decode-anchor-string action))
+ (setf (w3m-uri-transfer-coding-system action)
+ (if charset
+ (w3m-charset-to-coding-system charset)
+ w3m-current-coding-system)))
(setq forms
(cons
(cons
Index: w3m-util.el
===================================================================
RCS file: /storage/cvsroot/emacs-w3m/w3m-util.el,v
retrieving revision 1.43
diff -u -r1.43 w3m-util.el
--- w3m-util.el 15 Jul 2003 04:26:50 -0000 1.43
+++ w3m-util.el 23 Jul 2003 15:22:16 -0000
@@ -345,15 +345,7 @@
(sort (w3m-list-buffers t)
(function w3m-buffer-name-lessp))))
-
;;; Miscellaneous:
-(defconst w3m-url-fallback-base "http:///")
-(defconst w3m-url-invalid-regexp "\\`http:///")
-
-(defsubst w3m-url-valid (url)
- (and url (not (string-match w3m-url-invalid-regexp url))
- url))
-
(defmacro w3m-tag-regexp-of (&rest names)
"Return a regexp string, not a funtion form. A regexp should match tags
which are started with \"<\" and one of NAMES. NAMES should be string
@@ -378,6 +370,40 @@
(+ (* (- (car end) (car start)) 65536)
(cadr end)
(- (cadr start))))
+
+;;; URI handling functions:
+(defsubst w3m-uri-set-properties (uri properties)
+ "Discard URI's text properties and attach PROPERTIES as `w3m-uri-properties'."
+ (set-text-properties 0 (length uri) nil uri)
+ (put-text-property 0 (length uri) 'w3m-uri-properties properties uri))
+
+(defsetf w3m-uri-properties w3m-uri-set-properties)
+(defsubst w3m-uri-properties (uri)
+ "Return the properties associated to a given URI."
+ (get-text-property 0 'w3m-uri-properties uri))
+
+(defsubst w3m-uri-put (uri propname value)
+ "Store URI's PROPNAME property with value VALUE."
+ (setf (w3m-uri-properties uri)
+ (plist-put (w3m-uri-properties uri) propname value))
+ value)
+
+(defsetf w3m-uri-get w3m-uri-put)
+(defsubst w3m-uri-get (uri propname)
+ "Return the value of URI's PROPNAME property."
+ (plist-get (w3m-uri-properties uri) propname))
+
+(defmacro w3m-uri-transfer-coding-system (uri)
+ "Return the coding system to transfer a given URI."
+ `(w3m-uri-get ,uri 'transfer-coding-system))
+
+;; MEMO: 以下の w3m-url-* 関数群は,徐々に w3m-uri-* に名前を統一する予定.
+(defconst w3m-url-fallback-base "http:///")
+(defconst w3m-url-invalid-regexp "\\`http:///")
+
+(defsubst w3m-url-valid (url)
+ (and url (not (string-match w3m-url-invalid-regexp url))
+ url))
(defsubst w3m-url-local-p (url)
"If URL points a file on the local system, return non-nil value.
Index: w3m.el
===================================================================
RCS file: /storage/cvsroot/emacs-w3m/w3m.el,v
retrieving revision 1.831
diff -u -r1.831 w3m.el
--- w3m.el 17 Jul 2003 14:52:36 -0000 1.831
+++ w3m.el 23 Jul 2003 15:22:16 -0000
@@ -2328,29 +2328,34 @@
w3m-coding-system
'iso-2022-7bit))))
-(defsubst w3m-url-transfer-encode-string (url &optional coding)
- "Encode all non-ASCII characters included in URL to sequences of
+;; MEMO: query 部分は,上の w3m-url-encode-string() で encode し,残る
+;; 部分のみを送信用に encode するべきなのか?
+(defsubst w3m-uri-transfer-encode-string (uri &optional coding)
+ "Encode all non-ASCII characters included in URI to sequences of
escaped octets in the specified coding system.
-This function is designed for conversion for safe transmission of URL.
+This function is designed for conversion for safe transmission of URI.
Therefore, this function handles only non-ASCII characters that can
not be transmitted safely with network streams. In general, you
should use `w3m-url-encode-string' instead of this."
(let ((start 0)
(buf))
- (while (string-match "[^\x21-\x7e]+" url start)
+ (unless coding
+ (setq coding (or (w3m-uri-transfer-coding-system uri)
+ w3m-default-coding-system
+ w3m-coding-system
+ 'iso-2022-7bit)))
+ (while (string-match "[^\x21-\x7e]+" uri start)
(setq buf
(cons (apply 'concat
(mapcar
(lambda (c) (format "%%%02x" c))
- (append (encode-coding-string
- (match-string 0 url)
- (or coding
- w3m-current-coding-system)))))
- (cons (substring url start (match-beginning 0))
+ (append (encode-coding-string (match-string 0 uri)
+ coding))))
+ (cons (substring uri start (match-beginning 0))
buf))
start (match-end 0)))
(apply 'concat
- (nreverse (cons (substring url start) buf)))))
+ (nreverse (cons (substring uri start) buf)))))
;;; HTML character entity handling:
@@ -2456,8 +2461,7 @@
(while (re-search-forward "<a[ \t\r\f\n]+" nil t)
(setq start (match-beginning 0))
(setq prenames (get-text-property start 'w3m-name-anchor))
- (w3m-parse-attributes (href name charset
- (rel :case-ignore) (hseq :integer))
+ (w3m-parse-attributes (href name (rel :case-ignore) (hseq :integer))
(when rel
(setq rel (split-string rel))
(cond
@@ -2472,18 +2476,11 @@
(when (re-search-forward "[ \t\r\f\n]*\\(</a>\\)" nil t)
(setq end (match-beginning 0))
(delete-region (match-beginning 1) (match-end 1))
- (setq href (w3m-expand-url (w3m-decode-anchor-string href)))
- (setq href (if (and (string-match w3m-url-components-regexp href)
- (match-beginning 8))
- (concat (w3m-url-transfer-encode-string
- (substring href 0 (match-beginning 8))
- (w3m-charset-to-coding-system charset))
- "#" (match-string 9 href))
- (w3m-url-transfer-encode-string
- href
- (w3m-charset-to-coding-system charset)))
+ (setq href (w3m-expand-url (w3m-decode-anchor-string href))
hseq (or (and (null hseq) 0) (abs hseq))
w3m-max-anchor-sequence (max hseq w3m-max-anchor-sequence))
+ (setf (w3m-uri-transfer-coding-system href)
+ w3m-current-coding-system)
(w3m-add-text-properties start end
(list 'face (if (w3m-arrived-p href)
'w3m-arrived-anchor-face
@@ -3454,7 +3451,8 @@
(w3m-process-do-with-temp-buffer
(success (progn
(setq w3m-current-url url
- url (w3m-url-strip-authinfo url))
+ url (w3m-uri-transfer-encode-string
+ (w3m-url-strip-authinfo url)))
(w3m-process-start handler
w3m-command
(append w3m-command-arguments
@@ -3568,7 +3566,7 @@
w3m-command-arguments
(w3m-w3m-expand-arguments
w3m-dump-head-source-command-arguments)
- (list url))))
+ (list (w3m-uri-transfer-encode-string url)))))
(w3m-message "Reading %s...done" url)
(when result
(goto-char (point-min))
@@ -6006,11 +6004,7 @@
nil ;; referer
nil ;; handler
t)) ;; qsearch
- (set-text-properties 0 (length url) nil url)
(setq url (w3m-uri-replace url))
- (unless (or (w3m-url-local-p url)
- (string-match "\\`about:" url))
- (setq url (w3m-url-transfer-encode-string url w3m-default-coding-system)))
(cond
;; process mailto: protocol
((string-match "\\`mailto:" url)