Ruby 3.2.2p53 (2023-03-30 revision e51014f9c05aa65cbf203442d37fef7c12390015)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "gc.h"
27#include "id.h"
28#include "internal.h"
29#include "internal/array.h"
30#include "internal/compar.h"
31#include "internal/compilers.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/util.h"
46#include "ruby_assert.h"
47#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
73#undef rb_str_buf_cat
74#undef rb_str_buf_cat2
75#undef rb_str_cat2
76#undef rb_str_cat_cstr
77#undef rb_fstring_cstr
78
81
82/* FLAGS of RString
83 *
84 * 1: RSTRING_NOEMBED
85 * 2: STR_SHARED (== ELTS_SHARED)
86 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
87 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
88 * other strings that rely on this string's buffer)
89 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
90 * early, specific to rb_str_tmp_frozen_{acquire,release})
91 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
92 * such as read(2). Any modification and realloc is prohibited)
93 *
94 * 8-9: ENC_CODERANGE (2 bits)
95 * 10-16: ENCODING (7 bits == 128)
96 * 17: RSTRING_FSTR
97 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
98 * used for a string object based on C string literal)
99 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
100 * object header is temporarily allocated on C stack)
101 */
102
103#define RUBY_MAX_CHAR_LEN 16
104#define STR_SHARED_ROOT FL_USER5
105#define STR_BORROWED FL_USER6
106#define STR_TMPLOCK FL_USER7
107#define STR_NOFREE FL_USER18
108#define STR_FAKESTR FL_USER19
109
/* Flag str as a heap (non-embedded) string.  With RVARGC the sharing and
 * borrowing flags are cleared as well; without it the embedded-length
 * bits are reset to zero instead. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    if (!USE_RVARGC) {\
        STR_SET_EMBED_LEN((str), 0);\
    }\
    else {\
        FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    }\
} while (0)
/* Flag str as embedded; STR_NOFREE is dropped together with STR_NOEMBED. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED | STR_NOFREE))
/* Record n as the length of an embedded string: in the RSTRING_EMBED_LEN
 * bits of the flags word without RVARGC, in as.embed.len with it. */
#if !USE_RVARGC
# define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
#else
# define STR_SET_EMBED_LEN(str, n) do { \
    assert(str_embed_capa(str) > (n));\
    RSTRING(str)->as.embed.len = (n);\
} while (0)
#endif
132
/* Store n as the length of str in whichever layout (heap or embedded)
 * the string currently uses. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)
141
/* Shrink the length of str by one byte, for either layout. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        const long n = RSTRING_LEN(str) - 1;\
        STR_SET_EMBED_LEN((str), n);\
    }\
} while (0)
152
153static inline bool
154str_enc_fastpath(VALUE str)
155{
156 // The overwhelming majority of strings are in one of these 3 encodings.
157 switch (ENCODING_GET_INLINED(str)) {
158 case ENCINDEX_ASCII_8BIT:
159 case ENCINDEX_UTF_8:
160 case ENCINDEX_US_ASCII:
161 return true;
162 default:
163 return false;
164 }
165}
166
/* Terminator width for str: 1 for the fast-path encodings, otherwise the
 * minimum character length of the string's encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* Write a terminator of termlen zero bytes at ptr.  The first byte is
 * always stored directly; memset is used only for wider terminators. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    *term_fill_ptr = '\0';\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
} while (0)
175
/* Resize the buffer of str to `capacity` bytes, using the terminator width
 * of the string's current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/* Resize the buffer of str to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 * - Embedded string: nothing happens while the slot is large enough;
 *   otherwise the contents are copied into a freshly allocated heap buffer
 *   and the string is switched to the heap layout.  The copy is done before
 *   the heap fields are written because RSTRING_PTR still refers to the
 *   embedded bytes at that point.
 * - Heap string: the buffer is reallocated in place; it must not be shared.
 *
 * Both macro arguments are parenthesized wherever they are used so that an
 * expression argument (e.g. `a ? b : c`) cannot change the comparison. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
199
/* Make str share the buffer owned by shared_str.  Fake strings are left
 * untouched.  The asserts check that str's pointer lies inside the buffer
 * of shared_str; the root is flagged STR_SHARED_ROOT, and additionally
 * STR_BORROWED when it has no class. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST((str), STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
211
/* Buffer pointer of a heap-layout string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Byte size of the heap buffer: recorded capacity plus the terminator.
 * TODO: include the terminator size in capa. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))

/* Encoding object of str. */
#define STR_ENC_GET(str) get_encoding(str)
217
/* SHARABLE_MIDDLE_SUBSTRING defaults to 0.  When it is 0 a substring is
 * sharable only if it runs to `end`; when non-zero every substring is. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
226
227
228static inline long
229str_embed_capa(VALUE str)
230{
231#if USE_RVARGC
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233#else
234 return RSTRING_EMBED_LEN_MAX + 1;
235#endif
236}
237
238bool
239rb_str_reembeddable_p(VALUE str)
240{
241 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242}
243
244static inline size_t
245rb_str_embed_size(long capa)
246{
247 return offsetof(struct RString, as.embed.ary) + capa;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254#if USE_RVARGC
255 if (STR_EMBED_P(str)) {
256 real_size = rb_str_embed_size(RSTRING(str)->as.embed.len) + TERM_LEN(str);
257 }
258 /* if the string is not currently embedded, but it can be embedded, how
259 * much space would it require */
260 else if (rb_str_reembeddable_p(str)) {
261 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
262 }
263 else {
264#endif
265 real_size = sizeof(struct RString);
266#if USE_RVARGC
267 }
268#endif
269 return real_size;
270}
271
272static inline bool
273STR_EMBEDDABLE_P(long len, long termlen)
274{
275#if USE_RVARGC
276 return rb_gc_size_allocatable_p(rb_str_embed_size(len + termlen));
277#else
278 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
279#endif
280}
281
282static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
283static VALUE str_new_frozen(VALUE klass, VALUE orig);
284static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
285static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
286static VALUE str_new(VALUE klass, const char *ptr, long len);
287static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
288static inline void str_modifiable(VALUE str);
289static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->as.heap.len;
317
318 STR_SET_EMBED(str);
319 STR_SET_EMBED_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
329void
330rb_str_update_shared_ary(VALUE str, VALUE old_root, VALUE new_root)
331{
332 // if the root location hasn't changed, we don't need to update
333 if (new_root == old_root) {
334 return;
335 }
336
337 // if the root string isn't embedded, we don't need to touch the ponter.
338 // it already points to the shame shared buffer
339 if (!STR_EMBED_P(new_root)) {
340 return;
341 }
342
343 size_t offset = (size_t)((uintptr_t)RSTRING(str)->as.heap.ptr - (uintptr_t)RSTRING(old_root)->as.embed.ary);
344
345 RUBY_ASSERT(RSTRING(str)->as.heap.ptr >= RSTRING(old_root)->as.embed.ary);
346 RSTRING(str)->as.heap.ptr = RSTRING(new_root)->as.embed.ary + offset;
347}
348
/* Print a diagnostic to stderr naming func as a function that is about to
 * return a NULL string pointer. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
358
359/* symbols for [up|down|swap]case/capitalize options */
360static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
361
362static rb_encoding *
363get_encoding(VALUE str)
364{
365 return rb_enc_from_index(ENCODING_GET(str));
366}
367
368static void
369mustnot_broken(VALUE str)
370{
371 if (is_broken_string(str)) {
372 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
373 }
374}
375
376static void
377mustnot_wchar(VALUE str)
378{
379 rb_encoding *enc = STR_ENC_GET(str);
380 if (rb_enc_mbminlen(enc) > 1) {
381 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
382 }
383}
384
385static int fstring_cmp(VALUE a, VALUE b);
386
387static VALUE register_fstring(VALUE str, bool copy);
388
389const struct st_hash_type rb_fstring_hash_type = {
390 fstring_cmp,
392};
393
394#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
395
397 VALUE fstr;
398 bool copy;
399};
400
401static int
402fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
403{
404
405 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
406 VALUE str = (VALUE)*key;
407
408 if (existing) {
409 /* because of lazy sweep, str may be unmarked already and swept
410 * at next time */
411
412 if (rb_objspace_garbage_object_p(str)) {
413 arg->fstr = Qundef;
414 return ST_DELETE;
415 }
416
417 arg->fstr = str;
418 return ST_STOP;
419 }
420 else {
421 if (FL_TEST_RAW(str, STR_FAKESTR)) {
422 if (arg->copy) {
423 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
424 rb_enc_copy(new_str, str);
425 str = new_str;
426 }
427 else {
428 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
429 RSTRING(str)->as.heap.len,
430 ENCODING_GET(str));
431 }
432 OBJ_FREEZE_RAW(str);
433 }
434 else {
435 if (!OBJ_FROZEN(str))
436 str = str_new_frozen(rb_cString, str);
437 if (STR_SHARED_P(str)) { /* str should not be shared */
438 /* shared substring */
439 str_make_independent(str);
440 assert(OBJ_FROZEN(str));
441 }
442 if (!BARE_STRING_P(str)) {
443 str = str_new_frozen(rb_cString, str);
444 }
445 }
446 RBASIC(str)->flags |= RSTRING_FSTR;
447
448 *key = *value = arg->fstr = str;
449 return ST_CONTINUE;
450 }
451}
452
453RUBY_FUNC_EXPORTED
454VALUE
455rb_fstring(VALUE str)
456{
457 VALUE fstr;
458 int bare;
459
460 Check_Type(str, T_STRING);
461
462 if (FL_TEST(str, RSTRING_FSTR))
463 return str;
464
465 bare = BARE_STRING_P(str);
466 if (!bare) {
467 if (STR_EMBED_P(str)) {
468 OBJ_FREEZE_RAW(str);
469 return str;
470 }
471 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
472 assert(OBJ_FROZEN(str));
473 return str;
474 }
475 }
476
477 if (!OBJ_FROZEN(str))
478 rb_str_resize(str, RSTRING_LEN(str));
479
480 fstr = register_fstring(str, FALSE);
481
482 if (!bare) {
483 str_replace_shared_without_enc(str, fstr);
484 OBJ_FREEZE_RAW(str);
485 return str;
486 }
487 return fstr;
488}
489
490static VALUE
491register_fstring(VALUE str, bool copy)
492{
493 struct fstr_update_arg args;
494 args.copy = copy;
495
496 RB_VM_LOCK_ENTER();
497 {
498 st_table *frozen_strings = rb_vm_fstring_table();
499 do {
500 args.fstr = str;
501 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
502 } while (UNDEF_P(args.fstr));
503 }
504 RB_VM_LOCK_LEAVE();
505
506 assert(OBJ_FROZEN(args.fstr));
507 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
508 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
509 assert(RBASIC_CLASS(args.fstr) == rb_cString);
510 return args.fstr;
511}
512
513static VALUE
514setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
515{
516 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
517 /* SHARED to be allocated by the callback */
518
519 if (!name) {
520 RUBY_ASSERT_ALWAYS(len == 0);
521 name = "";
522 }
523
524 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
525
526 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
527 fake_str->as.heap.len = len;
528 fake_str->as.heap.ptr = (char *)name;
529 fake_str->as.heap.aux.capa = len;
530 return (VALUE)fake_str;
531}
532
533/*
534 * set up a fake string which refers a static string literal.
535 */
536VALUE
537rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
538{
539 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
540}
541
542/*
543 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
544 * shared string which refers a static string literal. `ptr` must
545 * point a constant string.
546 */
547MJIT_FUNC_EXPORTED VALUE
548rb_fstring_new(const char *ptr, long len)
549{
550 struct RString fake_str;
551 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
552}
553
554VALUE
555rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
556{
557 struct RString fake_str;
558 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
559}
560
561VALUE
562rb_fstring_cstr(const char *ptr)
563{
564 return rb_fstring_new(ptr, strlen(ptr));
565}
566
567static int
568fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
569{
570 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
571 return ST_CONTINUE;
572}
573
574static int
575fstring_cmp(VALUE a, VALUE b)
576{
577 long alen, blen;
578 const char *aptr, *bptr;
579 RSTRING_GETMEM(a, aptr, alen);
580 RSTRING_GETMEM(b, bptr, blen);
581 return (alen != blen ||
582 ENCODING_GET(a) != ENCODING_GET(b) ||
583 memcmp(aptr, bptr, alen) != 0);
584}
585
586static inline int
587single_byte_optimizable(VALUE str)
588{
589 rb_encoding *enc;
590
591 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
593 return 1;
594
595 enc = STR_ENC_GET(str);
596 if (rb_enc_mbmaxlen(enc) == 1)
597 return 1;
598
599 /* Conservative. Possibly single byte.
600 * "\xa1" in Shift_JIS for example. */
601 return 0;
602}
603
605
/*
 * Scan the bytes in [p, e) and return a pointer to the first byte that has
 * its high bit (0x80) set, or NULL when there is none.
 *
 * Most of the range is examined one uintptr_t word at a time by masking
 * with NONASCII_MASK; bytes before the first aligned word and after the
 * last full word are checked individually.
 */
606static inline const char *
607search_nonascii(const char *p, const char *e)
608{
609 const uintptr_t *s, *t;
610
/* NONASCII_MASK has 0x80 set in every byte of a uintptr_t. */
611#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
612# if SIZEOF_UINTPTR_T == 8
613# define NONASCII_MASK UINT64_C(0x8080808080808080)
614# elif SIZEOF_UINTPTR_T == 4
615# define NONASCII_MASK UINT32_C(0x80808080)
616# else
617# error "don't know what to do."
618# endif
619#else
620# if SIZEOF_UINTPTR_T == 8
621# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
622# elif SIZEOF_UINTPTR_T == 4
623# define NONASCII_MASK 0x80808080UL /* or...? */
624# else
625# error "don't know what to do."
626# endif
627#endif
628
/* Word-wise scan; entered only when at least one pointer-sized word is
 * available, unless unaligned word access is permitted. */
629 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
630#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary, checking the skipped bytes one
 * by one.  The case labels fall through on purpose. */
631 if ((uintptr_t)p % SIZEOF_VOIDP) {
632 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
633 p += l;
634 switch (l) {
635 default: UNREACHABLE;
636#if SIZEOF_VOIDP > 4
637 case 7: if (p[-7]&0x80) return p-7;
638 case 6: if (p[-6]&0x80) return p-6;
639 case 5: if (p[-5]&0x80) return p-5;
640 case 4: if (p[-4]&0x80) return p-4;
641#endif
642 case 3: if (p[-3]&0x80) return p-3;
643 case 2: if (p[-2]&0x80) return p-2;
644 case 1: if (p[-1]&0x80) return p-1;
645 case 0: break;
646 }
647 }
648#endif
649#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
650#define aligned_ptr(value) \
651 __builtin_assume_aligned((value), sizeof(uintptr_t))
652#else
653#define aligned_ptr(value) (uintptr_t *)(value)
654#endif
655 s = aligned_ptr(p);
/* t bounds the loop so that only words lying fully inside [p, e) are
 * read. */
656 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
657#undef aligned_ptr
658 for (;s < t; s++) {
659 if (*s & NONASCII_MASK) {
/* The bit position found by nlz/ntz, shifted right by 3, is the byte
 * offset of the hit within the word. */
660#ifdef WORDS_BIGENDIAN
661 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
662#else
663 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
664#endif
665 }
666 }
667 p = (const char *)s;
668 }
669
/* Check the remaining tail bytes individually, indexing back from e.
 * The case labels fall through on purpose. */
670 switch (e - p) {
671 default: UNREACHABLE;
672#if SIZEOF_VOIDP > 4
673 case 7: if (e[-7]&0x80) return e-7;
674 case 6: if (e[-6]&0x80) return e-6;
675 case 5: if (e[-5]&0x80) return e-5;
676 case 4: if (e[-4]&0x80) return e-4;
677#endif
678 case 3: if (e[-3]&0x80) return e-3;
679 case 2: if (e[-2]&0x80) return e-2;
680 case 1: if (e[-1]&0x80) return e-1;
681 case 0: return NULL;
682 }
683}
684
685static int
686coderange_scan(const char *p, long len, rb_encoding *enc)
687{
688 const char *e = p + len;
689
690 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
691 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
692 p = search_nonascii(p, e);
694 }
695
696 if (rb_enc_asciicompat(enc)) {
697 p = search_nonascii(p, e);
698 if (!p) return ENC_CODERANGE_7BIT;
699 for (;;) {
700 int ret = rb_enc_precise_mbclen(p, e, enc);
702 p += MBCLEN_CHARFOUND_LEN(ret);
703 if (p == e) break;
704 p = search_nonascii(p, e);
705 if (!p) break;
706 }
707 }
708 else {
709 while (p < e) {
710 int ret = rb_enc_precise_mbclen(p, e, enc);
712 p += MBCLEN_CHARFOUND_LEN(ret);
713 }
714 }
715 return ENC_CODERANGE_VALID;
716}
717
718long
719rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
720{
721 const char *p = s;
722
723 if (*cr == ENC_CODERANGE_BROKEN)
724 return e - s;
725
726 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
727 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
728 if (*cr == ENC_CODERANGE_VALID) return e - s;
729 p = search_nonascii(p, e);
731 return e - s;
732 }
733 else if (rb_enc_asciicompat(enc)) {
734 p = search_nonascii(p, e);
735 if (!p) {
736 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
737 return e - s;
738 }
739 for (;;) {
740 int ret = rb_enc_precise_mbclen(p, e, enc);
741 if (!MBCLEN_CHARFOUND_P(ret)) {
743 return p - s;
744 }
745 p += MBCLEN_CHARFOUND_LEN(ret);
746 if (p == e) break;
747 p = search_nonascii(p, e);
748 if (!p) break;
749 }
750 }
751 else {
752 while (p < e) {
753 int ret = rb_enc_precise_mbclen(p, e, enc);
754 if (!MBCLEN_CHARFOUND_P(ret)) {
756 return p - s;
757 }
758 p += MBCLEN_CHARFOUND_LEN(ret);
759 }
760 }
762 return e - s;
763}
764
765static inline void
766str_enc_copy(VALUE str1, VALUE str2)
767{
768 rb_enc_set_index(str1, ENCODING_GET(str2));
769}
770
771static void
772rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
773{
774 /* this function is designed for copying encoding and coderange
775 * from src to new string "dest" which is made from the part of src.
776 */
777 str_enc_copy(dest, src);
778 if (RSTRING_LEN(dest) == 0) {
779 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
781 else
783 return;
784 }
785 switch (ENC_CODERANGE(src)) {
788 break;
790 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
791 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
793 else
795 break;
796 default:
797 break;
798 }
799}
800
801static void
802rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
803{
804 str_enc_copy(dest, src);
806}
807
808static int
809enc_coderange_scan(VALUE str, rb_encoding *enc)
810{
811 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
812}
813
814int
815rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
816{
817 return enc_coderange_scan(str, enc);
818}
819
820int
822{
823 int cr = ENC_CODERANGE(str);
824
825 if (cr == ENC_CODERANGE_UNKNOWN) {
826 cr = enc_coderange_scan(str, get_encoding(str));
827 ENC_CODERANGE_SET(str, cr);
828 }
829 return cr;
830}
831
832int
834{
835 rb_encoding *enc = STR_ENC_GET(str);
836
837 if (!rb_enc_asciicompat(enc))
838 return FALSE;
839 else if (is_ascii_string(str))
840 return TRUE;
841 return FALSE;
842}
843
844static inline void
845str_mod_check(VALUE s, const char *p, long len)
846{
847 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
848 rb_raise(rb_eRuntimeError, "string modified");
849 }
850}
851
852static size_t
853str_capacity(VALUE str, const int termlen)
854{
855 if (STR_EMBED_P(str)) {
856#if USE_RVARGC
857 return str_embed_capa(str) - termlen;
858#else
859 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
860#endif
861 }
862 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
863 return RSTRING(str)->as.heap.len;
864 }
865 else {
866 return RSTRING(str)->as.heap.aux.capa;
867 }
868}
869
870size_t
872{
873 return str_capacity(str, TERM_LEN(str));
874}
875
876static inline void
877must_not_null(const char *ptr)
878{
879 if (!ptr) {
880 rb_raise(rb_eArgError, "NULL pointer given");
881 }
882}
883
884static inline VALUE
885str_alloc_embed(VALUE klass, size_t capa)
886{
887 size_t size = rb_str_embed_size(capa);
888 assert(size > 0);
889 assert(rb_gc_size_allocatable_p(size));
890#if !USE_RVARGC
891 assert(size <= sizeof(struct RString));
892#endif
893
894 RVARGC_NEWOBJ_OF(str, struct RString, klass,
896
897 return (VALUE)str;
898}
899
900static inline VALUE
901str_alloc_heap(VALUE klass)
902{
903 RVARGC_NEWOBJ_OF(str, struct RString, klass,
904 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));
905
906 return (VALUE)str;
907}
908
909static inline VALUE
910empty_str_alloc(VALUE klass)
911{
912 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
913 VALUE str = str_alloc_embed(klass, 0);
914 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
915 return str;
916}
917
918static VALUE
919str_new0(VALUE klass, const char *ptr, long len, int termlen)
920{
921 VALUE str;
922
923 if (len < 0) {
924 rb_raise(rb_eArgError, "negative string size (or size too big)");
925 }
926
927 RUBY_DTRACE_CREATE_HOOK(STRING, len);
928
929 if (STR_EMBEDDABLE_P(len, termlen)) {
930 str = str_alloc_embed(klass, len + termlen);
931 if (len == 0) {
933 }
934 }
935 else {
936 str = str_alloc_heap(klass);
937 RSTRING(str)->as.heap.aux.capa = len;
938 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
939 * integer overflow. If we can STATIC_ASSERT that, the following
940 * mul_add_mul can be reverted to a simple ALLOC_N. */
941 RSTRING(str)->as.heap.ptr =
942 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
943 }
944 if (ptr) {
945 memcpy(RSTRING_PTR(str), ptr, len);
946 }
947 STR_SET_LEN(str, len);
948 TERM_FILL(RSTRING_PTR(str) + len, termlen);
949 return str;
950}
951
952static VALUE
953str_new(VALUE klass, const char *ptr, long len)
954{
955 return str_new0(klass, ptr, len, 1);
956}
957
958VALUE
959rb_str_new(const char *ptr, long len)
960{
961 return str_new(rb_cString, ptr, len);
962}
963
964VALUE
965rb_usascii_str_new(const char *ptr, long len)
966{
967 VALUE str = rb_str_new(ptr, len);
968 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
969 return str;
970}
971
972VALUE
973rb_utf8_str_new(const char *ptr, long len)
974{
975 VALUE str = str_new(rb_cString, ptr, len);
976 rb_enc_associate_index(str, rb_utf8_encindex());
977 return str;
978}
979
980VALUE
981rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
982{
983 VALUE str;
984
985 if (!enc) return rb_str_new(ptr, len);
986
987 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
988 rb_enc_associate(str, enc);
989 return str;
990}
991
992VALUE
994{
995 must_not_null(ptr);
996 /* rb_str_new_cstr() can take pointer from non-malloc-generated
997 * memory regions, and that cannot be detected by the MSAN. Just
998 * trust the programmer that the argument passed here is a sane C
999 * string. */
1000 __msan_unpoison_string(ptr);
1001 return rb_str_new(ptr, strlen(ptr));
1002}
1003
1004VALUE
1006{
1007 VALUE str = rb_str_new_cstr(ptr);
1008 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
1009 return str;
1010}
1011
1012VALUE
1014{
1015 VALUE str = rb_str_new_cstr(ptr);
1016 rb_enc_associate_index(str, rb_utf8_encindex());
1017 return str;
1018}
1019
1020VALUE
1021rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
1022{
1023 must_not_null(ptr);
1024 if (rb_enc_mbminlen(enc) != 1) {
1025 rb_raise(rb_eArgError, "wchar encoding given");
1026 }
1027 return rb_enc_str_new(ptr, strlen(ptr), enc);
1028}
1029
1030static VALUE
1031str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1032{
1033 VALUE str;
1034
1035 if (len < 0) {
1036 rb_raise(rb_eArgError, "negative string size (or size too big)");
1037 }
1038
1039 if (!ptr) {
1040 rb_encoding *enc = rb_enc_get_from_index(encindex);
1041 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1042 }
1043 else {
1044 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1045 str = str_alloc_heap(klass);
1046 RSTRING(str)->as.heap.len = len;
1047 RSTRING(str)->as.heap.ptr = (char *)ptr;
1048 RSTRING(str)->as.heap.aux.capa = len;
1049 RBASIC(str)->flags |= STR_NOFREE;
1050 }
1051 rb_enc_associate_index(str, encindex);
1052 return str;
1053}
1054
1055VALUE
1056rb_str_new_static(const char *ptr, long len)
1057{
1058 return str_new_static(rb_cString, ptr, len, 0);
1059}
1060
1061VALUE
1063{
1064 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1065}
1066
1067VALUE
1069{
1070 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1071}
1072
1073VALUE
1075{
1076 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1077}
1078
1079static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1080 rb_encoding *from, rb_encoding *to,
1081 int ecflags, VALUE ecopts);
1082
1083static inline bool
1084is_enc_ascii_string(VALUE str, rb_encoding *enc)
1085{
1086 int encidx = rb_enc_to_index(enc);
1087 if (rb_enc_get_index(str) == encidx)
1088 return is_ascii_string(str);
1089 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1090}
1091
1092VALUE
1093rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1094{
1095 long len;
1096 const char *ptr;
1097 VALUE newstr;
1098
1099 if (!to) return str;
1100 if (!from) from = rb_enc_get(str);
1101 if (from == to) return str;
1102 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1103 rb_is_ascii8bit_enc(to)) {
1104 if (STR_ENC_GET(str) != to) {
1105 str = rb_str_dup(str);
1106 rb_enc_associate(str, to);
1107 }
1108 return str;
1109 }
1110
1111 RSTRING_GETMEM(str, ptr, len);
1112 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1113 from, to, ecflags, ecopts);
1114 if (NIL_P(newstr)) {
1115 /* some error, return original */
1116 return str;
1117 }
1118 return newstr;
1119}
1120
1121VALUE
1122rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1123 rb_encoding *from, int ecflags, VALUE ecopts)
1124{
1125 long olen;
1126
1127 olen = RSTRING_LEN(newstr);
1128 if (ofs < -olen || olen < ofs)
1129 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1130 if (ofs < 0) ofs += olen;
1131 if (!from) {
1132 STR_SET_LEN(newstr, ofs);
1133 return rb_str_cat(newstr, ptr, len);
1134 }
1135
1136 rb_str_modify(newstr);
1137 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1138 rb_enc_get(newstr),
1139 ecflags, ecopts);
1140}
1141
1142VALUE
1143rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1144{
1145 STR_SET_LEN(str, 0);
1146 rb_enc_associate(str, enc);
1147 rb_str_cat(str, ptr, len);
1148 return str;
1149}
1150
1151static VALUE
1152str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1153 rb_encoding *from, rb_encoding *to,
1154 int ecflags, VALUE ecopts)
1155{
1156 rb_econv_t *ec;
1158 long olen;
1159 VALUE econv_wrapper;
1160 const unsigned char *start, *sp;
1161 unsigned char *dest, *dp;
1162 size_t converted_output = (size_t)ofs;
1163
1164 olen = rb_str_capacity(newstr);
1165
1166 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1167 RBASIC_CLEAR_CLASS(econv_wrapper);
1168 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1169 if (!ec) return Qnil;
1170 DATA_PTR(econv_wrapper) = ec;
1171
1172 sp = (unsigned char*)ptr;
1173 start = sp;
1174 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1175 (dp = dest + converted_output),
1176 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1178 /* destination buffer short */
1179 size_t converted_input = sp - start;
1180 size_t rest = len - converted_input;
1181 converted_output = dp - dest;
1182 rb_str_set_len(newstr, converted_output);
1183 if (converted_input && converted_output &&
1184 rest < (LONG_MAX / converted_output)) {
1185 rest = (rest * converted_output) / converted_input;
1186 }
1187 else {
1188 rest = olen;
1189 }
1190 olen += rest < 2 ? 2 : rest;
1191 rb_str_resize(newstr, olen);
1192 }
1193 DATA_PTR(econv_wrapper) = 0;
1194 rb_econv_close(ec);
1195 switch (ret) {
1196 case econv_finished:
1197 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1198 rb_str_set_len(newstr, len);
1199 rb_enc_associate(newstr, to);
1200 return newstr;
1201
1202 default:
1203 return Qnil;
1204 }
1205}
1206
1207VALUE
1209{
1210 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1211}
1212
1213VALUE
1215{
1216 rb_encoding *ienc;
1217 VALUE str;
1218 const int eidx = rb_enc_to_index(eenc);
1219
1220 if (!ptr) {
1221 return rb_enc_str_new(ptr, len, eenc);
1222 }
1223
1224 /* ASCII-8BIT case, no conversion */
1225 if ((eidx == rb_ascii8bit_encindex()) ||
1226 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1227 return rb_str_new(ptr, len);
1228 }
1229 /* no default_internal or same encoding, no conversion */
1230 ienc = rb_default_internal_encoding();
1231 if (!ienc || eenc == ienc) {
1232 return rb_enc_str_new(ptr, len, eenc);
1233 }
1234 /* ASCII compatible, and ASCII only string, no conversion in
1235 * default_internal */
1236 if ((eidx == rb_ascii8bit_encindex()) ||
1237 (eidx == rb_usascii_encindex()) ||
1238 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1239 return rb_enc_str_new(ptr, len, ienc);
1240 }
1241 /* convert from the given encoding to default_internal */
1242 str = rb_enc_str_new(NULL, 0, ienc);
1243 /* when the conversion failed for some reason, just ignore the
1244 * default_internal and result in the given encoding as-is. */
1245 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1246 rb_str_initialize(str, ptr, len, eenc);
1247 }
1248 return str;
1249}
1250
1251VALUE
1252rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1253{
1254 int eidx = rb_enc_to_index(eenc);
1255 if (eidx == rb_usascii_encindex() &&
1256 !is_ascii_string(str)) {
1257 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1258 return str;
1259 }
1260 rb_enc_associate_index(str, eidx);
1261 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1262}
1263
1264VALUE
1265rb_external_str_new(const char *ptr, long len)
1266{
1267 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1268}
1269
1270VALUE
1272{
1273 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1274}
1275
1276VALUE
1277rb_locale_str_new(const char *ptr, long len)
1278{
1279 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1280}
1281
1282VALUE
1284{
1285 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1286}
1287
1288VALUE
1290{
1291 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1292}
1293
1294VALUE
1296{
1297 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1298}
1299
1300VALUE
1302{
1303 return rb_str_export_to_enc(str, rb_default_external_encoding());
1304}
1305
1306VALUE
1308{
1309 return rb_str_export_to_enc(str, rb_locale_encoding());
1310}
1311
1312VALUE
1314{
1315 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1316}
1317
1318static VALUE
1319str_replace_shared_without_enc(VALUE str2, VALUE str)
1320{
1321 const int termlen = TERM_LEN(str);
1322 char *ptr;
1323 long len;
1324
1325 RSTRING_GETMEM(str, ptr, len);
1326 if (str_embed_capa(str2) >= len + termlen) {
1327 char *ptr2 = RSTRING(str2)->as.embed.ary;
1328 STR_SET_EMBED(str2);
1329 memcpy(ptr2, RSTRING_PTR(str), len);
1330 STR_SET_EMBED_LEN(str2, len);
1331 TERM_FILL(ptr2+len, termlen);
1332 }
1333 else {
1334 VALUE root;
1335 if (STR_SHARED_P(str)) {
1336 root = RSTRING(str)->as.heap.aux.shared;
1337 RSTRING_GETMEM(str, ptr, len);
1338 }
1339 else {
1340 root = rb_str_new_frozen(str);
1341 RSTRING_GETMEM(root, ptr, len);
1342 }
1343 assert(OBJ_FROZEN(root));
1344 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1345 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1346 rb_fatal("about to free a possible shared root");
1347 }
1348 char *ptr2 = STR_HEAP_PTR(str2);
1349 if (ptr2 != ptr) {
1350 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1351 }
1352 }
1353 FL_SET(str2, STR_NOEMBED);
1354 RSTRING(str2)->as.heap.len = len;
1355 RSTRING(str2)->as.heap.ptr = ptr;
1356 STR_SET_SHARED(str2, root);
1357 }
1358 return str2;
1359}
1360
1361static VALUE
1362str_replace_shared(VALUE str2, VALUE str)
1363{
1364 str_replace_shared_without_enc(str2, str);
1365 rb_enc_cr_str_exact_copy(str2, str);
1366 return str2;
1367}
1368
1369static VALUE
1370str_new_shared(VALUE klass, VALUE str)
1371{
1372 return str_replace_shared(str_alloc_heap(klass), str);
1373}
1374
1375VALUE
1377{
1378 return str_new_shared(rb_obj_class(str), str);
1379}
1380
1381VALUE
1383{
1384 if (OBJ_FROZEN(orig)) return orig;
1385 return str_new_frozen(rb_obj_class(orig), orig);
1386}
1387
1388static VALUE
1389rb_str_new_frozen_String(VALUE orig)
1390{
1391 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1392 return str_new_frozen(rb_cString, orig);
1393}
1394
1395VALUE
1396rb_str_tmp_frozen_acquire(VALUE orig)
1397{
1398 if (OBJ_FROZEN_RAW(orig)) return orig;
1399 return str_new_frozen_buffer(0, orig, FALSE);
1400}
1401
1402void
1403rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1404{
1405 if (RBASIC_CLASS(tmp) != 0)
1406 return;
1407
1408 if (STR_EMBED_P(tmp)) {
1409 assert(OBJ_FROZEN_RAW(tmp));
1410 }
1411 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1412 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1413 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1414
1415 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1416 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1417 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1418
1419 /* Unshare orig since the root (tmp) only has this one child. */
1420 FL_UNSET_RAW(orig, STR_SHARED);
1421 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1422 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1423 assert(OBJ_FROZEN_RAW(tmp));
1424
1425 /* Make tmp embedded and empty so it is safe for sweeping. */
1426 STR_SET_EMBED(tmp);
1427 STR_SET_EMBED_LEN(tmp, 0);
1428 }
1429 }
1430}
1431
1432static VALUE
1433str_new_frozen(VALUE klass, VALUE orig)
1434{
1435 return str_new_frozen_buffer(klass, orig, TRUE);
1436}
1437
1438static VALUE
1439heap_str_make_shared(VALUE klass, VALUE orig)
1440{
1441 assert(!STR_EMBED_P(orig));
1442 assert(!STR_SHARED_P(orig));
1443
1444 VALUE str = str_alloc_heap(klass);
1445 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1446 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1447 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1448 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1449 RBASIC(orig)->flags &= ~STR_NOFREE;
1450 STR_SET_SHARED(orig, str);
1451 if (klass == 0)
1452 FL_UNSET_RAW(str, STR_BORROWED);
1453 return str;
1454}
1455
1456static VALUE
1457str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1458{
1459 VALUE str;
1460
1461 long len = RSTRING_LEN(orig);
1462 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1463
1464 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1465 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1466 assert(STR_EMBED_P(str));
1467 }
1468 else {
1469 if (FL_TEST_RAW(orig, STR_SHARED)) {
1470 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1471 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1472 long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1473 assert(ofs >= 0);
1474 assert(rest >= 0);
1475 assert(ofs + rest <= RSTRING_LEN(shared));
1476#if !USE_RVARGC
1477 assert(!STR_EMBED_P(shared));
1478#endif
1479 assert(OBJ_FROZEN(shared));
1480
1481 if ((ofs > 0) || (rest > 0) ||
1482 (klass != RBASIC(shared)->klass) ||
1483 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1484 str = str_new_shared(klass, shared);
1485 assert(!STR_EMBED_P(str));
1486 RSTRING(str)->as.heap.ptr += ofs;
1487 RSTRING(str)->as.heap.len -= ofs + rest;
1488 }
1489 else {
1490 if (RBASIC_CLASS(shared) == 0)
1491 FL_SET_RAW(shared, STR_BORROWED);
1492 return shared;
1493 }
1494 }
1495 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1496 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1497 STR_SET_EMBED(str);
1498 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1499 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1500 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1501 }
1502 else {
1503 str = heap_str_make_shared(klass, orig);
1504 }
1505 }
1506
1507 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1508 OBJ_FREEZE(str);
1509 return str;
1510}
1511
1512VALUE
1513rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1514{
1515 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1516}
1517
1518static VALUE
1519str_new_empty_String(VALUE str)
1520{
1521 VALUE v = rb_str_new(0, 0);
1522 rb_enc_copy(v, str);
1523 return v;
1524}
1525
1526#define STR_BUF_MIN_SIZE 63
1527#if !USE_RVARGC
1528STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1529#endif
1530
1531VALUE
1533{
1534 if (STR_EMBEDDABLE_P(capa, 1)) {
1535 return str_alloc_embed(rb_cString, capa + 1);
1536 }
1537
1538 VALUE str = str_alloc_heap(rb_cString);
1539
1540#if !USE_RVARGC
1541 if (capa < STR_BUF_MIN_SIZE) {
1542 capa = STR_BUF_MIN_SIZE;
1543 }
1544#endif
1545 RSTRING(str)->as.heap.aux.capa = capa;
1546 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1547 RSTRING(str)->as.heap.ptr[0] = '\0';
1548
1549 return str;
1550}
1551
1552VALUE
1554{
1555 VALUE str;
1556 long len = strlen(ptr);
1557
1558 str = rb_str_buf_new(len);
1559 rb_str_buf_cat(str, ptr, len);
1560
1561 return str;
1562}
1563
1564VALUE
1566{
1567 return str_new(0, 0, len);
1568}
1569
1570void
1572{
1573 if (FL_TEST(str, RSTRING_FSTR)) {
1574 st_data_t fstr = (st_data_t)str;
1575
1576 RB_VM_LOCK_ENTER();
1577 {
1578 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1579 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1580 }
1581 RB_VM_LOCK_LEAVE();
1582 }
1583
1584 if (STR_EMBED_P(str)) {
1585 RB_DEBUG_COUNTER_INC(obj_str_embed);
1586 }
1587 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1588 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1589 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1590 }
1591 else {
1592 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1593 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1594 }
1595}
1596
1597RUBY_FUNC_EXPORTED size_t
1598rb_str_memsize(VALUE str)
1599{
1600 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1601 return STR_HEAP_SIZE(str);
1602 }
1603 else {
1604 return 0;
1605 }
1606}
1607
1608VALUE
1610{
1611 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1612}
1613
1614static inline void str_discard(VALUE str);
1615static void str_shared_replace(VALUE str, VALUE str2);
1616
1617void
1619{
1620 if (str != str2) str_shared_replace(str, str2);
1621}
1622
1623static void
1624str_shared_replace(VALUE str, VALUE str2)
1625{
1626 rb_encoding *enc;
1627 int cr;
1628 int termlen;
1629
1630 RUBY_ASSERT(str2 != str);
1631 enc = STR_ENC_GET(str2);
1632 cr = ENC_CODERANGE(str2);
1633 str_discard(str);
1634 termlen = rb_enc_mbminlen(enc);
1635
1636 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1637 STR_SET_EMBED(str);
1638 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1639 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1640 rb_enc_associate(str, enc);
1641 ENC_CODERANGE_SET(str, cr);
1642 }
1643 else {
1644#if USE_RVARGC
1645 if (STR_EMBED_P(str2)) {
1646 assert(!FL_TEST(str2, STR_SHARED));
1647 long len = RSTRING(str2)->as.embed.len;
1648 assert(len + termlen <= str_embed_capa(str2));
1649
1650 char *new_ptr = ALLOC_N(char, len + termlen);
1651 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1652 RSTRING(str2)->as.heap.ptr = new_ptr;
1653 RSTRING(str2)->as.heap.len = len;
1654 RSTRING(str2)->as.heap.aux.capa = len;
1655 STR_SET_NOEMBED(str2);
1656 }
1657#endif
1658
1659 STR_SET_NOEMBED(str);
1660 FL_UNSET(str, STR_SHARED);
1661 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1662 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1663
1664 if (FL_TEST(str2, STR_SHARED)) {
1665 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1666 STR_SET_SHARED(str, shared);
1667 }
1668 else {
1669 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1670 }
1671
1672 /* abandon str2 */
1673 STR_SET_EMBED(str2);
1674 RSTRING_PTR(str2)[0] = 0;
1675 STR_SET_EMBED_LEN(str2, 0);
1676 rb_enc_associate(str, enc);
1677 ENC_CODERANGE_SET(str, cr);
1678 }
1679}
1680
1681VALUE
1683{
1684 VALUE str;
1685
1686 if (RB_TYPE_P(obj, T_STRING)) {
1687 return obj;
1688 }
1689 str = rb_funcall(obj, idTo_s, 0);
1690 return rb_obj_as_string_result(str, obj);
1691}
1692
1693MJIT_FUNC_EXPORTED VALUE
1694rb_obj_as_string_result(VALUE str, VALUE obj)
1695{
1696 if (!RB_TYPE_P(str, T_STRING))
1697 return rb_any_to_s(obj);
1698 return str;
1699}
1700
1701static VALUE
1702str_replace(VALUE str, VALUE str2)
1703{
1704 long len;
1705
1706 len = RSTRING_LEN(str2);
1707 if (STR_SHARED_P(str2)) {
1708 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1709 assert(OBJ_FROZEN(shared));
1710 STR_SET_NOEMBED(str);
1711 RSTRING(str)->as.heap.len = len;
1712 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1713 STR_SET_SHARED(str, shared);
1714 rb_enc_cr_str_exact_copy(str, str2);
1715 }
1716 else {
1717 str_replace_shared(str, str2);
1718 }
1719
1720 return str;
1721}
1722
1723static inline VALUE
1724ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1725{
1726 size_t size = rb_str_embed_size(capa);
1727 assert(size > 0);
1728 assert(rb_gc_size_allocatable_p(size));
1729#if !USE_RVARGC
1730 assert(size <= sizeof(struct RString));
1731#endif
1732
1733 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1735
1736 return (VALUE)str;
1737}
1738
1739static inline VALUE
1740ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1741{
1742 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1743 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));
1744
1745 return (VALUE)str;
1746}
1747
1748static inline VALUE
1749str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1750{
1751 const VALUE flag_mask =
1752#if !USE_RVARGC
1753 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1754#endif
1756 FL_FREEZE
1757 ;
1758 VALUE flags = FL_TEST_RAW(str, flag_mask);
1759 int encidx = 0;
1760 if (STR_EMBED_P(str)) {
1761 long len = RSTRING_EMBED_LEN(str);
1762
1763 assert(STR_EMBED_P(dup));
1764 assert(str_embed_capa(dup) >= len + 1);
1765 STR_SET_EMBED_LEN(dup, len);
1766 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1767 }
1768 else {
1769 VALUE root = str;
1770 if (FL_TEST_RAW(str, STR_SHARED)) {
1771 root = RSTRING(str)->as.heap.aux.shared;
1772 }
1773 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1774 root = str = str_new_frozen(klass, str);
1775 flags = FL_TEST_RAW(str, flag_mask);
1776 }
1777 assert(!STR_SHARED_P(root));
1778 assert(RB_OBJ_FROZEN_RAW(root));
1779 if (0) {}
1780#if !USE_RVARGC
1781 else if (STR_EMBED_P(root)) {
1782 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1783 char, RSTRING_EMBED_LEN_MAX + 1);
1784 FL_UNSET(dup, STR_NOEMBED);
1785 }
1786#endif
1787 else {
1788 RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1789 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1790 FL_SET(root, STR_SHARED_ROOT);
1791 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1792 flags |= RSTRING_NOEMBED | STR_SHARED;
1793 }
1794 }
1795
1796 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1797 encidx = rb_enc_get_index(str);
1798 flags &= ~ENCODING_MASK;
1799 }
1800 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1801 if (encidx) rb_enc_associate_index(dup, encidx);
1802 return dup;
1803}
1804
1805static inline VALUE
1806ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1807{
1808 VALUE dup;
1809 if (FL_TEST(str, STR_NOEMBED)) {
1810 dup = ec_str_alloc_heap(ec, klass);
1811 }
1812 else {
1813 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1814 }
1815
1816 return str_duplicate_setup(klass, str, dup);
1817}
1818
1819static inline VALUE
1820str_duplicate(VALUE klass, VALUE str)
1821{
1822 VALUE dup;
1823 if (FL_TEST(str, STR_NOEMBED)) {
1824 dup = str_alloc_heap(klass);
1825 }
1826 else {
1827 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1828 }
1829
1830 return str_duplicate_setup(klass, str, dup);
1831}
1832
1833VALUE
1835{
1836 return str_duplicate(rb_obj_class(str), str);
1837}
1838
1839VALUE
1841{
1842 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1843 return str_duplicate(rb_cString, str);
1844}
1845
1846VALUE
1847rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1848{
1849 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1850 return ec_str_duplicate(ec, rb_cString, str);
1851}
1852
1853/*
1854 *
1855 * call-seq:
1856 * String.new(string = '', **opts) -> new_string
1857 *
1858 * :include: doc/string/new.rdoc
1859 *
1860 */
1861
1862static VALUE
1863rb_str_init(int argc, VALUE *argv, VALUE str)
1864{
1865 static ID keyword_ids[2];
1866 VALUE orig, opt, venc, vcapa;
1867 VALUE kwargs[2];
1868 rb_encoding *enc = 0;
1869 int n;
1870
1871 if (!keyword_ids[0]) {
1872 keyword_ids[0] = rb_id_encoding();
1873 CONST_ID(keyword_ids[1], "capacity");
1874 }
1875
1876 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1877 if (!NIL_P(opt)) {
1878 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1879 venc = kwargs[0];
1880 vcapa = kwargs[1];
1881 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1882 enc = rb_to_encoding(venc);
1883 }
1884 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1885 long capa = NUM2LONG(vcapa);
1886 long len = 0;
1887 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1888
1889 if (capa < STR_BUF_MIN_SIZE) {
1890 capa = STR_BUF_MIN_SIZE;
1891 }
1892 if (n == 1) {
1893 StringValue(orig);
1894 len = RSTRING_LEN(orig);
1895 if (capa < len) {
1896 capa = len;
1897 }
1898 if (orig == str) n = 0;
1899 }
1900 str_modifiable(str);
1901 if (STR_EMBED_P(str)) { /* make noembed always */
1902 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1903#if USE_RVARGC
1904 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1905 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1906#else
1907 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1908#endif
1909 RSTRING(str)->as.heap.ptr = new_ptr;
1910 }
1911 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1912 const size_t size = (size_t)capa + termlen;
1913 const char *const old_ptr = RSTRING_PTR(str);
1914 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1915 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1916 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1917 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1918 RSTRING(str)->as.heap.ptr = new_ptr;
1919 }
1920 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1921 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1922 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1923 }
1924 RSTRING(str)->as.heap.len = len;
1925 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1926 if (n == 1) {
1927 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1928 rb_enc_cr_str_exact_copy(str, orig);
1929 }
1930 FL_SET(str, STR_NOEMBED);
1931 RSTRING(str)->as.heap.aux.capa = capa;
1932 }
1933 else if (n == 1) {
1934 rb_str_replace(str, orig);
1935 }
1936 if (enc) {
1937 rb_enc_associate(str, enc);
1939 }
1940 }
1941 else if (n == 1) {
1942 rb_str_replace(str, orig);
1943 }
1944 return str;
1945}
1946
1947#ifdef NONASCII_MASK
1948#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1949
1950/*
1951 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1952 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1953 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1954 *
1955 * if (!(byte & 0x80))
1956 * byte |= 0x40; // turn on bit6
1957 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1958 *
1959 * This function calculates whether a byte is leading or not for all bytes
1960 * in the argument word by concurrently using the above logic, and then
1961 * adds up the number of leading bytes in the word.
1962 */
1963static inline uintptr_t
1964count_utf8_lead_bytes_with_word(const uintptr_t *s)
1965{
1966 uintptr_t d = *s;
1967
1968 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1969 d = (d>>6) | (~d>>7);
1970 d &= NONASCII_MASK >> 7;
1971
1972 /* Gather all bytes. */
1973#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1974 /* use only if it can use POPCNT */
1975 return rb_popcount_intptr(d);
1976#else
1977 d += (d>>8);
1978 d += (d>>16);
1979# if SIZEOF_VOIDP == 8
1980 d += (d>>32);
1981# endif
1982 return (d&0xF);
1983#endif
1984}
1985#endif
1986
1987static inline long
1988enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1989{
1990 long c;
1991 const char *q;
1992
1993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1994 long diff = (long)(e - p);
1995 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1996 }
1997#ifdef NONASCII_MASK
1998 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1999 uintptr_t len = 0;
2000 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2001 const uintptr_t *s, *t;
2002 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2003 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2004 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2005 while (p < (const char *)s) {
2006 if (is_utf8_lead_byte(*p)) len++;
2007 p++;
2008 }
2009 while (s < t) {
2010 len += count_utf8_lead_bytes_with_word(s);
2011 s++;
2012 }
2013 p = (const char *)s;
2014 }
2015 while (p < e) {
2016 if (is_utf8_lead_byte(*p)) len++;
2017 p++;
2018 }
2019 return (long)len;
2020 }
2021#endif
2022 else if (rb_enc_asciicompat(enc)) {
2023 c = 0;
2024 if (ENC_CODERANGE_CLEAN_P(cr)) {
2025 while (p < e) {
2026 if (ISASCII(*p)) {
2027 q = search_nonascii(p, e);
2028 if (!q)
2029 return c + (e - p);
2030 c += q - p;
2031 p = q;
2032 }
2033 p += rb_enc_fast_mbclen(p, e, enc);
2034 c++;
2035 }
2036 }
2037 else {
2038 while (p < e) {
2039 if (ISASCII(*p)) {
2040 q = search_nonascii(p, e);
2041 if (!q)
2042 return c + (e - p);
2043 c += q - p;
2044 p = q;
2045 }
2046 p += rb_enc_mbclen(p, e, enc);
2047 c++;
2048 }
2049 }
2050 return c;
2051 }
2052
2053 for (c=0; p<e; c++) {
2054 p += rb_enc_mbclen(p, e, enc);
2055 }
2056 return c;
2057}
2058
2059long
2060rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2061{
2062 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2063}
2064
2065/* To get strlen with cr
2066 * Note that given cr is not used.
2067 */
2068long
2069rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2070{
2071 long c;
2072 const char *q;
2073 int ret;
2074
2075 *cr = 0;
2076 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2077 long diff = (long)(e - p);
2078 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2079 }
2080 else if (rb_enc_asciicompat(enc)) {
2081 c = 0;
2082 while (p < e) {
2083 if (ISASCII(*p)) {
2084 q = search_nonascii(p, e);
2085 if (!q) {
2086 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2087 return c + (e - p);
2088 }
2089 c += q - p;
2090 p = q;
2091 }
2092 ret = rb_enc_precise_mbclen(p, e, enc);
2093 if (MBCLEN_CHARFOUND_P(ret)) {
2094 *cr |= ENC_CODERANGE_VALID;
2095 p += MBCLEN_CHARFOUND_LEN(ret);
2096 }
2097 else {
2099 p++;
2100 }
2101 c++;
2102 }
2103 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2104 return c;
2105 }
2106
2107 for (c=0; p<e; c++) {
2108 ret = rb_enc_precise_mbclen(p, e, enc);
2109 if (MBCLEN_CHARFOUND_P(ret)) {
2110 *cr |= ENC_CODERANGE_VALID;
2111 p += MBCLEN_CHARFOUND_LEN(ret);
2112 }
2113 else {
2115 if (p + rb_enc_mbminlen(enc) <= e)
2116 p += rb_enc_mbminlen(enc);
2117 else
2118 p = e;
2119 }
2120 }
2121 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2122 return c;
2123}
2124
2125/* enc must be str's enc or rb_enc_check(str, str2) */
2126static long
2127str_strlen(VALUE str, rb_encoding *enc)
2128{
2129 const char *p, *e;
2130 int cr;
2131
2132 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2133 if (!enc) enc = STR_ENC_GET(str);
2134 p = RSTRING_PTR(str);
2135 e = RSTRING_END(str);
2136 cr = ENC_CODERANGE(str);
2137
2138 if (cr == ENC_CODERANGE_UNKNOWN) {
2139 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2140 if (cr) ENC_CODERANGE_SET(str, cr);
2141 return n;
2142 }
2143 else {
2144 return enc_strlen(p, e, enc, cr);
2145 }
2146}
2147
2148long
2150{
2151 return str_strlen(str, NULL);
2152}
2153
2154/*
2155 * call-seq:
2156 * length -> integer
2157 *
2158 * :include: doc/string/length.rdoc
2159 *
2160 */
2161
2162VALUE
2164{
2165 return LONG2NUM(str_strlen(str, NULL));
2166}
2167
2168/*
2169 * call-seq:
2170 * bytesize -> integer
2171 *
2172 * :include: doc/string/bytesize.rdoc
2173 *
2174 */
2175
2176static VALUE
2177rb_str_bytesize(VALUE str)
2178{
2179 return LONG2NUM(RSTRING_LEN(str));
2180}
2181
2182/*
2183 * call-seq:
2184 * empty? -> true or false
2185 *
2186 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2187 *
2188 * "hello".empty? # => false
2189 * " ".empty? # => false
2190 * "".empty? # => true
2191 *
2192 */
2193
2194static VALUE
2195rb_str_empty(VALUE str)
2196{
2197 return RBOOL(RSTRING_LEN(str) == 0);
2198}
2199
2200/*
2201 * call-seq:
2202 * string + other_string -> new_string
2203 *
2204 * Returns a new \String containing +other_string+ concatenated to +self+:
2205 *
2206 * "Hello from " + self.to_s # => "Hello from main"
2207 *
2208 */
2209
2210VALUE
2212{
2213 VALUE str3;
2214 rb_encoding *enc;
2215 char *ptr1, *ptr2, *ptr3;
2216 long len1, len2;
2217 int termlen;
2218
2219 StringValue(str2);
2220 enc = rb_enc_check_str(str1, str2);
2221 RSTRING_GETMEM(str1, ptr1, len1);
2222 RSTRING_GETMEM(str2, ptr2, len2);
2223 termlen = rb_enc_mbminlen(enc);
2224 if (len1 > LONG_MAX - len2) {
2225 rb_raise(rb_eArgError, "string size too big");
2226 }
2227 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2228 ptr3 = RSTRING_PTR(str3);
2229 memcpy(ptr3, ptr1, len1);
2230 memcpy(ptr3+len1, ptr2, len2);
2231 TERM_FILL(&ptr3[len1+len2], termlen);
2232
2233 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2235 RB_GC_GUARD(str1);
2236 RB_GC_GUARD(str2);
2237 return str3;
2238}
2239
2240/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2241MJIT_FUNC_EXPORTED VALUE
2242rb_str_opt_plus(VALUE str1, VALUE str2)
2243{
2244 assert(RBASIC_CLASS(str1) == rb_cString);
2245 assert(RBASIC_CLASS(str2) == rb_cString);
2246 long len1, len2;
2247 MAYBE_UNUSED(char) *ptr1, *ptr2;
2248 RSTRING_GETMEM(str1, ptr1, len1);
2249 RSTRING_GETMEM(str2, ptr2, len2);
2250 int enc1 = rb_enc_get_index(str1);
2251 int enc2 = rb_enc_get_index(str2);
2252
2253 if (enc1 < 0) {
2254 return Qundef;
2255 }
2256 else if (enc2 < 0) {
2257 return Qundef;
2258 }
2259 else if (enc1 != enc2) {
2260 return Qundef;
2261 }
2262 else if (len1 > LONG_MAX - len2) {
2263 return Qundef;
2264 }
2265 else {
2266 return rb_str_plus(str1, str2);
2267 }
2268
2269}
2270
2271/*
2272 * call-seq:
2273 * string * integer -> new_string
2274 *
2275 * Returns a new \String containing +integer+ copies of +self+:
2276 *
2277 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2278 * "Ho! " * 0 # => ""
2279 *
2280 */
2281
2282VALUE
2284{
2285 VALUE str2;
2286 long n, len;
2287 char *ptr2;
2288 int termlen;
2289
2290 if (times == INT2FIX(1)) {
2291 return str_duplicate(rb_cString, str);
2292 }
2293 if (times == INT2FIX(0)) {
2294 str2 = str_alloc_embed(rb_cString, 0);
2295 rb_enc_copy(str2, str);
2296 return str2;
2297 }
2298 len = NUM2LONG(times);
2299 if (len < 0) {
2300 rb_raise(rb_eArgError, "negative argument");
2301 }
2302 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2303 if (STR_EMBEDDABLE_P(len, 1)) {
2304 str2 = str_alloc_embed(rb_cString, len + 1);
2305 memset(RSTRING_PTR(str2), 0, len + 1);
2306 }
2307 else {
2308 str2 = str_alloc_heap(rb_cString);
2309 RSTRING(str2)->as.heap.aux.capa = len;
2310 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2311 }
2312 STR_SET_LEN(str2, len);
2313 rb_enc_copy(str2, str);
2314 return str2;
2315 }
2316 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2317 rb_raise(rb_eArgError, "argument too big");
2318 }
2319
2320 len *= RSTRING_LEN(str);
2321 termlen = TERM_LEN(str);
2322 str2 = str_new0(rb_cString, 0, len, termlen);
2323 ptr2 = RSTRING_PTR(str2);
2324 if (len) {
2325 n = RSTRING_LEN(str);
2326 memcpy(ptr2, RSTRING_PTR(str), n);
2327 while (n <= len/2) {
2328 memcpy(ptr2 + n, ptr2, n);
2329 n *= 2;
2330 }
2331 memcpy(ptr2 + n, ptr2, len-n);
2332 }
2333 STR_SET_LEN(str2, len);
2334 TERM_FILL(&ptr2[len], termlen);
2335 rb_enc_cr_str_copy_for_substr(str2, str);
2336
2337 return str2;
2338}
2339
2340/*
2341 * call-seq:
2342 * string % object -> new_string
2343 *
2344 * Returns the result of formatting +object+ into the format specification +self+
2345 * (see Kernel#sprintf for formatting details):
2346 *
2347 * "%05d" % 123 # => "00123"
2348 *
2349 * If +self+ contains multiple substitutions, +object+ must be
2350 * an \Array or \Hash containing the values to be substituted:
2351 *
2352 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2353 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2354 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2355 *
2356 */
2357
2358static VALUE
2359rb_str_format_m(VALUE str, VALUE arg)
2360{
2361 VALUE tmp = rb_check_array_type(arg);
2362
2363 if (!NIL_P(tmp)) {
2364 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2365 }
2366 return rb_str_format(1, &arg, str);
2367}
2368
2369static inline void
2370rb_check_lockedtmp(VALUE str)
2371{
2372 if (FL_TEST(str, STR_TMPLOCK)) {
2373 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2374 }
2375}
2376
2377static inline void
2378str_modifiable(VALUE str)
2379{
2380 rb_check_lockedtmp(str);
2381 rb_check_frozen(str);
2382}
2383
2384static inline int
2385str_dependent_p(VALUE str)
2386{
2387 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2388 return 0;
2389 }
2390 else {
2391 return 1;
2392 }
2393}
2394
2395static inline int
2396str_independent(VALUE str)
2397{
2398 str_modifiable(str);
2399 return !str_dependent_p(str);
2400}
2401
2402static void
2403str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2404{
2405 char *ptr;
2406 char *oldptr;
2407 long capa = len + expand;
2408
2409 if (len > capa) len = capa;
2410
2411 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2412 ptr = RSTRING(str)->as.heap.ptr;
2413 STR_SET_EMBED(str);
2414 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2415 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2416 STR_SET_EMBED_LEN(str, len);
2417 return;
2418 }
2419
2420 ptr = ALLOC_N(char, (size_t)capa + termlen);
2421 oldptr = RSTRING_PTR(str);
2422 if (oldptr) {
2423 memcpy(ptr, oldptr, len);
2424 }
2425 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2426 xfree(oldptr);
2427 }
2428 STR_SET_NOEMBED(str);
2429 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2430 TERM_FILL(ptr + len, termlen);
2431 RSTRING(str)->as.heap.ptr = ptr;
2432 RSTRING(str)->as.heap.len = len;
2433 RSTRING(str)->as.heap.aux.capa = capa;
2434}
2435
2436void
2438{
2439 if (!str_independent(str))
2440 str_make_independent(str);
2442}
2443
2444void
2446{
2447 int termlen = TERM_LEN(str);
2448 long len = RSTRING_LEN(str);
2449
2450 if (expand < 0) {
2451 rb_raise(rb_eArgError, "negative expanding string size");
2452 }
2453 if (expand >= LONG_MAX - len) {
2454 rb_raise(rb_eArgError, "string size too big");
2455 }
2456
2457 if (!str_independent(str)) {
2458 str_make_independent_expand(str, len, expand, termlen);
2459 }
2460 else if (expand > 0) {
2461 RESIZE_CAPA_TERM(str, len + expand, termlen);
2462 }
2464}
2465
2466/* As rb_str_modify(), but don't clear coderange */
2467static void
2468str_modify_keep_cr(VALUE str)
2469{
2470 if (!str_independent(str))
2471 str_make_independent(str);
2473 /* Force re-scan later */
2475}
2476
2477static inline void
2478str_discard(VALUE str)
2479{
2480 str_modifiable(str);
2481 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2482 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2483 RSTRING(str)->as.heap.ptr = 0;
2484 RSTRING(str)->as.heap.len = 0;
2485 }
2486}
2487
2488void
2490{
2491 rb_encoding *enc = rb_enc_get(str);
2492 if (!enc) {
2493 rb_raise(rb_eTypeError, "not encoding capable object");
2494 }
2495 if (!rb_enc_asciicompat(enc)) {
2496 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2497 }
2498}
2499
2500VALUE
2502{
2503 VALUE s = *ptr;
2504 if (!RB_TYPE_P(s, T_STRING)) {
2505 s = rb_str_to_str(s);
2506 *ptr = s;
2507 }
2508 return s;
2509}
2510
2511char *
2513{
2514 VALUE str = rb_string_value(ptr);
2515 return RSTRING_PTR(str);
2516}
2517
/*
 * Returns 1 when the first n bytes at s are all zero, 0 otherwise.
 * No byte is read when n is zero or negative, and the result is 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') {
            return 0;
        }
    }
    return 1;
}
2526
2527static const char *
2528str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2529{
2530 const char *e = s + len;
2531
2532 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2533 if (zero_filled(s, minlen)) return s;
2534 }
2535 return 0;
2536}
2537
2538static char *
2539str_fill_term(VALUE str, char *s, long len, int termlen)
2540{
2541 /* This function assumes that (capa + termlen) bytes of memory
2542 * is allocated, like many other functions in this file.
2543 */
2544 if (str_dependent_p(str)) {
2545 if (!zero_filled(s + len, termlen))
2546 str_make_independent_expand(str, len, 0L, termlen);
2547 }
2548 else {
2549 TERM_FILL(s + len, termlen);
2550 return s;
2551 }
2552 return RSTRING_PTR(str);
2553}
2554
2555void
2556rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2557{
2558 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2559 long len = RSTRING_LEN(str);
2560
2561 assert(capa >= len);
2562 if (capa - len < termlen) {
2563 rb_check_lockedtmp(str);
2564 str_make_independent_expand(str, len, 0L, termlen);
2565 }
2566 else if (str_dependent_p(str)) {
2567 if (termlen > oldtermlen)
2568 str_make_independent_expand(str, len, 0L, termlen);
2569 }
2570 else {
2571 if (!STR_EMBED_P(str)) {
2572 /* modify capa instead of realloc */
2573 assert(!FL_TEST((str), STR_SHARED));
2574 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2575 }
2576 if (termlen > oldtermlen) {
2577 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2578 }
2579 }
2580
2581 return;
2582}
2583
2584static char *
2585str_null_check(VALUE str, int *w)
2586{
2587 char *s = RSTRING_PTR(str);
2588 long len = RSTRING_LEN(str);
2589 rb_encoding *enc = rb_enc_get(str);
2590 const int minlen = rb_enc_mbminlen(enc);
2591
2592 if (minlen > 1) {
2593 *w = 1;
2594 if (str_null_char(s, len, minlen, enc)) {
2595 return NULL;
2596 }
2597 return str_fill_term(str, s, len, minlen);
2598 }
2599 *w = 0;
2600 if (!s || memchr(s, 0, len)) {
2601 return NULL;
2602 }
2603 if (s[len]) {
2604 s = str_fill_term(str, s, len, minlen);
2605 }
2606 return s;
2607}
2608
2609char *
2610rb_str_to_cstr(VALUE str)
2611{
2612 int w;
2613 return str_null_check(str, &w);
2614}
2615
2616char *
2618{
2619 VALUE str = rb_string_value(ptr);
2620 int w;
2621 char *s = str_null_check(str, &w);
2622 if (!s) {
2623 if (w) {
2624 rb_raise(rb_eArgError, "string contains null char");
2625 }
2626 rb_raise(rb_eArgError, "string contains null byte");
2627 }
2628 return s;
2629}
2630
2631char *
2632rb_str_fill_terminator(VALUE str, const int newminlen)
2633{
2634 char *s = RSTRING_PTR(str);
2635 long len = RSTRING_LEN(str);
2636 return str_fill_term(str, s, len, newminlen);
2637}
2638
2639VALUE
2641{
2642 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2643 return str;
2644}
2645
2646/*
2647 * call-seq:
2648 * String.try_convert(object) -> object, new_string, or nil
2649 *
2650 * If +object+ is a \String object, returns +object+.
2651 *
2652 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2653 * calls <tt>object.to_str</tt> and returns the result.
2654 *
2655 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2656 *
2657 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2658 */
2659static VALUE
2660rb_str_s_try_convert(VALUE dummy, VALUE str)
2661{
2662 return rb_check_string_type(str);
2663}
2664
/*
 * Moves from +p+ towards +e+ by *nthp characters of encoding +enc+ and
 * returns the resulting position, never beyond +e+.
 *
 * On return *nthp holds the local counter as the taken branch left it:
 * unchanged for the two fixed-width branches, the number of characters
 * still outstanding for the ASCII-compatible branch, and the
 * post-decrement loop counter for the generic branch.
 */
static char*
str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
{
    long nth = *nthp;
    if (rb_enc_mbmaxlen(enc) == 1) {
        /* every character is one byte */
        p += nth;
    }
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* fixed-width multi-byte encoding */
        p += nth * rb_enc_mbmaxlen(enc);
    }
    else if (rb_enc_asciicompat(enc)) {
        const char *p2, *e2;
        int n;

        while (p < e && 0 < nth) {
            /* e2: where we would land if all remaining chars were 1 byte */
            e2 = p + nth;
            if (e < e2) {
                /* fewer bytes remain than characters requested */
                *nthp = nth;
                return (char *)e;
            }
            if (ISASCII(*p)) {
                /* skip a run of ASCII bytes in one step */
                p2 = search_nonascii(p, e2);
                if (!p2) {
                    nth -= e2 - p;
                    *nthp = nth;
                    return (char *)e2;
                }
                nth -= p2 - p;
                p = p2;
            }
            /* step over one character using its encoded length */
            n = rb_enc_mbclen(p, e, enc);
            p += n;
            nth--;
        }
        *nthp = nth;
        if (nth != 0) {
            return (char *)e;
        }
        return (char *)p;
    }
    else {
        /* generic variable-width encoding: step one character at a time */
        while (p < e && nth--) {
            p += rb_enc_mbclen(p, e, enc);
        }
    }
    /* clamp for the branches that advanced without a bound check */
    if (p > e) p = e;
    *nthp = nth;
    return (char*)p;
}
2714
2715char*
2716rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2717{
2718 return str_nth_len(p, e, &nth, enc);
2719}
2720
2721static char*
2722str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2723{
2724 if (singlebyte)
2725 p += nth;
2726 else {
2727 p = str_nth_len(p, e, &nth, enc);
2728 }
2729 if (!p) return 0;
2730 if (p > e) p = e;
2731 return (char *)p;
2732}
2733
2734/* char offset to byte offset */
2735static long
2736str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2737{
2738 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2739 if (!pp) return e - p;
2740 return pp - p;
2741}
2742
2743long
2744rb_str_offset(VALUE str, long pos)
2745{
2746 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2747 STR_ENC_GET(str), single_byte_optimizable(str));
2748}
2749
#ifdef NONASCII_MASK
/*
 * UTF-8 specific character walk: advances from +p+ towards +e+ until
 * *nthp lead bytes have been passed and returns the position reached.
 * *nthp is updated with the count that is still outstanding.
 *
 * NOTE(review): the word loop relies on count_utf8_lead_bytes_with_word()
 * returning the number of lead bytes in one pointer-sized word; that
 * helper is defined outside this view.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    /* word-at-a-time path, only when both the input and nth are large enough */
    if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
        const uintptr_t *s, *t;
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* s: p rounded up, t: e rounded down, to a word boundary */
        s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
        /* handle the unaligned head byte by byte */
        while (p < (const char *)s) {
            if (is_utf8_lead_byte(*p)) nth--;
            p++;
        }
        /* stop while a whole word could still overshoot nth */
        do {
            nth -= count_utf8_lead_bytes_with_word(s);
            s++;
        } while (s < t && (int)SIZEOF_VOIDP <= nth);
        p = (char *)s;
    }
    /* finish byte by byte; stop on the lead byte of the target character */
    while (p < e) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
        p++;
    }
    *nthp = nth;
    return (char *)p;
}

/*
 * Returns the byte distance from +p+ to the position str_utf8_nth()
 * reaches for +nth+ characters.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *pp = str_utf8_nth(p, e, &nth);
    return pp - p;
}
#endif
2788
2789/* byte offset to char offset */
2790long
2791rb_str_sublen(VALUE str, long pos)
2792{
2793 if (single_byte_optimizable(str) || pos < 0)
2794 return pos;
2795 else {
2796 char *p = RSTRING_PTR(str);
2797 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2798 }
2799}
2800
2801static VALUE
2802str_subseq(VALUE str, long beg, long len)
2803{
2804 VALUE str2;
2805
2806 const long rstring_embed_capa_max = ((sizeof(struct RString) - offsetof(struct RString, as.embed.ary)) / sizeof(char)) - 1;
2807
2808 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str)) ||
2809 len <= rstring_embed_capa_max) {
2810 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2811 RB_GC_GUARD(str);
2812 }
2813 else {
2814 str2 = str_new_shared(rb_cString, str);
2815 ENC_CODERANGE_CLEAR(str2);
2816 RSTRING(str2)->as.heap.ptr += beg;
2817 if (RSTRING(str2)->as.heap.len > len) {
2818 RSTRING(str2)->as.heap.len = len;
2819 }
2820 }
2821
2822 return str2;
2823}
2824
2825VALUE
2826rb_str_subseq(VALUE str, long beg, long len)
2827{
2828 VALUE str2 = str_subseq(str, beg, len);
2829 rb_enc_cr_str_copy_for_substr(str2, str);
2830 return str2;
2831}
2832
/*
 * Resolves a character-based substring request on +str+.
 *
 * +beg+ is a character index (negative values count from the end) and
 * *lenp a character count.  On success the function returns a pointer
 * to the first byte of the substring and stores its length in bytes in
 * *lenp.  It returns 0 when the request is out of range or *lenp is
 * negative.
 */
char *
rb_str_subpos(VALUE str, long beg, long *lenp)
{
    long len = *lenp;
    long slen = -1L;
    long blen = RSTRING_LEN(str);
    rb_encoding *enc = STR_ENC_GET(str);
    char *p, *s = RSTRING_PTR(str), *e = s + blen;

    if (len < 0) return 0;
    if (!blen) {
        len = 0;
    }
    if (single_byte_optimizable(str)) {
        /* characters and bytes coincide: plain index arithmetic */
        if (beg > blen) return 0;
        if (beg < 0) {
            beg += blen;
            if (beg < 0) return 0;
        }
        if (len > blen - beg)
            len = blen - beg;
        if (len < 0) return 0;
        p = s + beg;
        goto end;
    }
    if (beg < 0) {
        if (len > -beg) len = -beg;
        if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
            /* start is near the end: walk backwards from the end
             * instead of counting the whole string */
            beg = -beg;
            /* move e back to the end of the requested range */
            while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
            p = e;
            if (!p) return 0;
            /* then move p back len characters to the start */
            while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
            if (!p) return 0;
            len = e - p;
            goto end;
        }
        else {
            /* turn the negative index into a forward one */
            slen = str_strlen(str, enc);
            beg += slen;
            if (beg < 0) return 0;
            p = s + beg;
            if (len == 0) goto end;
        }
    }
    else if (beg > 0 && beg > RSTRING_LEN(str)) {
        /* more characters requested than there are bytes */
        return 0;
    }
    if (len == 0) {
        if (beg > str_strlen(str, enc)) return 0; /* str's enc */
        p = s + beg;
    }
#ifdef NONASCII_MASK
    else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
             enc == rb_utf8_encoding()) {
        /* valid UTF-8: use the lead-byte counting helpers */
        p = str_utf8_nth(s, e, &beg);
        if (beg > 0) return 0;
        len = str_utf8_offset(p, e, len);
    }
#endif
    else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* fixed-width encoding: scale by the character size */
        int char_sz = rb_enc_mbmaxlen(enc);

        p = s + beg * char_sz;
        if (p > e) {
            return 0;
        }
        else if (len * char_sz > e - p)
            len = e - p;
        else
            len *= char_sz;
    }
    else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
        /* reached the end; beg holds what str_nth_len() left over */
        if (beg > 0) return 0;
        len = 0;
    }
    else {
        len = str_offset(p, e, len, enc, 0);
    }
  end:
    *lenp = len;
    RB_GC_GUARD(str);
    return p;
}
2917
2918static VALUE str_substr(VALUE str, long beg, long len, int empty);
2919
2920VALUE
2921rb_str_substr(VALUE str, long beg, long len)
2922{
2923 return str_substr(str, beg, len, TRUE);
2924}
2925
2926static VALUE
2927str_substr(VALUE str, long beg, long len, int empty)
2928{
2929 char *p = rb_str_subpos(str, beg, &len);
2930
2931 if (!p) return Qnil;
2932 if (!len && !empty) return Qnil;
2933
2934 beg = p - RSTRING_PTR(str);
2935
2936 VALUE str2 = str_subseq(str, beg, len);
2937 rb_enc_cr_str_copy_for_substr(str2, str);
2938 return str2;
2939}
2940
2941VALUE
2943{
2944 if (OBJ_FROZEN(str)) return str;
2945 rb_str_resize(str, RSTRING_LEN(str));
2946 return rb_obj_freeze(str);
2947}
2948
2949
2950/*
2951 * call-seq:
2952 * +string -> new_string or self
2953 *
2954 * Returns +self+ if +self+ is not frozen.
2955 *
2956 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
2957 */
2958static VALUE
2959str_uplus(VALUE str)
2960{
2961 if (OBJ_FROZEN(str)) {
2962 return rb_str_dup(str);
2963 }
2964 else {
2965 return str;
2966 }
2967}
2968
2969/*
2970 * call-seq:
2971 * -string -> frozen_string
2972 *
2973 * Returns a frozen, possibly pre-existing copy of the string.
2974 *
2975 * The returned \String will be deduplicated as long as it does not have
2976 * any instance variables set on it and is not a String subclass.
2977 *
2978 * String#dedup is an alias for String#-@.
2979 */
2980static VALUE
2981str_uminus(VALUE str)
2982{
2983 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2984 str = rb_str_dup(str);
2985 }
2986 return rb_fstring(str);
2987}
2988
/* rb_str_dup_frozen() is provided as an alias of rb_str_new_frozen();
 * the macro below maps later uses of the name in this file to
 * rb_str_new_frozen. */
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
#define rb_str_dup_frozen rb_str_new_frozen
2991
2992VALUE
2994{
2995 if (FL_TEST(str, STR_TMPLOCK)) {
2996 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2997 }
2998 FL_SET(str, STR_TMPLOCK);
2999 return str;
3000}
3001
3002VALUE
3004{
3005 if (!FL_TEST(str, STR_TMPLOCK)) {
3006 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3007 }
3008 FL_UNSET(str, STR_TMPLOCK);
3009 return str;
3010}
3011
3012RUBY_FUNC_EXPORTED VALUE
3013rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3014{
3015 rb_str_locktmp(str);
3016 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3017}
3018
3019void
3021{
3022 long capa;
3023 const int termlen = TERM_LEN(str);
3024
3025 str_modifiable(str);
3026 if (STR_SHARED_P(str)) {
3027 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3028 }
3029 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3030 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3031 }
3032 STR_SET_LEN(str, len);
3033 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3034}
3035
3036VALUE
3038{
3039 if (len < 0) {
3040 rb_raise(rb_eArgError, "negative string size (or size too big)");
3041 }
3042
3043 int independent = str_independent(str);
3044 long slen = RSTRING_LEN(str);
3045
3046 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3048 }
3049
3050 {
3051 long capa;
3052 const int termlen = TERM_LEN(str);
3053 if (STR_EMBED_P(str)) {
3054 if (len == slen) return str;
3055 if (str_embed_capa(str) >= len + termlen) {
3056 STR_SET_EMBED_LEN(str, len);
3057 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3058 return str;
3059 }
3060 str_make_independent_expand(str, slen, len - slen, termlen);
3061 }
3062 else if (str_embed_capa(str) >= len + termlen) {
3063 char *ptr = STR_HEAP_PTR(str);
3064 STR_SET_EMBED(str);
3065 if (slen > len) slen = len;
3066 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3067 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3068 STR_SET_EMBED_LEN(str, len);
3069 if (independent) ruby_xfree(ptr);
3070 return str;
3071 }
3072 else if (!independent) {
3073 if (len == slen) return str;
3074 str_make_independent_expand(str, slen, len - slen, termlen);
3075 }
3076 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3077 (capa - len) > (len < 1024 ? len : 1024)) {
3078 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3079 (size_t)len + termlen, STR_HEAP_SIZE(str));
3080 RSTRING(str)->as.heap.aux.capa = len;
3081 }
3082 else if (len == slen) return str;
3083 RSTRING(str)->as.heap.len = len;
3084 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3085 }
3086 return str;
3087}
3088
3089static VALUE
3090str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3091{
3092 if (keep_cr) {
3093 str_modify_keep_cr(str);
3094 }
3095 else {
3096 rb_str_modify(str);
3097 }
3098 if (len == 0) return 0;
3099
3100 long capa, total, olen, off = -1;
3101 char *sptr;
3102 const int termlen = TERM_LEN(str);
3103#if !USE_RVARGC
3104 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3105#endif
3106
3107 RSTRING_GETMEM(str, sptr, olen);
3108 if (ptr >= sptr && ptr <= sptr + olen) {
3109 off = ptr - sptr;
3110 }
3111
3112 if (STR_EMBED_P(str)) {
3113 capa = str_embed_capa(str) - termlen;
3114 sptr = RSTRING(str)->as.embed.ary;
3115 olen = RSTRING_EMBED_LEN(str);
3116 }
3117 else {
3118 capa = RSTRING(str)->as.heap.aux.capa;
3119 sptr = RSTRING(str)->as.heap.ptr;
3120 olen = RSTRING(str)->as.heap.len;
3121 }
3122 if (olen > LONG_MAX - len) {
3123 rb_raise(rb_eArgError, "string sizes too big");
3124 }
3125 total = olen + len;
3126 if (capa < total) {
3127 if (total >= LONG_MAX / 2) {
3128 capa = total;
3129 }
3130 while (total > capa) {
3131 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3132 }
3133 RESIZE_CAPA_TERM(str, capa, termlen);
3134 sptr = RSTRING_PTR(str);
3135 }
3136 if (off != -1) {
3137 ptr = sptr + off;
3138 }
3139 memcpy(sptr + olen, ptr, len);
3140 STR_SET_LEN(str, total);
3141 TERM_FILL(sptr + total, termlen); /* sentinel */
3142
3143 return str;
3144}
3145
/* Shorthands for str_buf_cat4() that do not preserve the code range.
 * str_buf_cat2() takes its length from rb_strlen_lit(ptr).
 * Every macro argument is parenthesized so that expression arguments
 * expand safely. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3148
3149VALUE
3150rb_str_cat(VALUE str, const char *ptr, long len)
3151{
3152 if (len == 0) return str;
3153 if (len < 0) {
3154 rb_raise(rb_eArgError, "negative string size (or size too big)");
3155 }
3156 return str_buf_cat(str, ptr, len);
3157}
3158
3159VALUE
3160rb_str_cat_cstr(VALUE str, const char *ptr)
3161{
3162 must_not_null(ptr);
3163 return rb_str_buf_cat(str, ptr, strlen(ptr));
3164}
3165
/* Alternative names kept as aliases: rb_str_buf_cat() for rb_str_cat(),
 * and rb_str_buf_cat2() / rb_str_cat2() for rb_str_cat_cstr(). */
RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3169
3170static VALUE
3171rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3172 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3173{
3174 int str_encindex = ENCODING_GET(str);
3175 int res_encindex;
3176 int str_cr, res_cr;
3177 rb_encoding *str_enc, *ptr_enc;
3178
3179 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3180
3181 if (str_encindex == ptr_encindex) {
3182 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3183 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3184 }
3185 }
3186 else {
3187 str_enc = rb_enc_from_index(str_encindex);
3188 ptr_enc = rb_enc_from_index(ptr_encindex);
3189 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3190 if (len == 0)
3191 return str;
3192 if (RSTRING_LEN(str) == 0) {
3193 rb_str_buf_cat(str, ptr, len);
3194 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3195 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3196 return str;
3197 }
3198 goto incompatible;
3199 }
3200 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3201 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3202 }
3203 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3204 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3205 str_cr = rb_enc_str_coderange(str);
3206 }
3207 }
3208 }
3209 if (ptr_cr_ret)
3210 *ptr_cr_ret = ptr_cr;
3211
3212 if (str_encindex != ptr_encindex &&
3213 str_cr != ENC_CODERANGE_7BIT &&
3214 ptr_cr != ENC_CODERANGE_7BIT) {
3215 str_enc = rb_enc_from_index(str_encindex);
3216 ptr_enc = rb_enc_from_index(ptr_encindex);
3217 goto incompatible;
3218 }
3219
3220 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3221 res_encindex = str_encindex;
3222 res_cr = ENC_CODERANGE_UNKNOWN;
3223 }
3224 else if (str_cr == ENC_CODERANGE_7BIT) {
3225 if (ptr_cr == ENC_CODERANGE_7BIT) {
3226 res_encindex = str_encindex;
3227 res_cr = ENC_CODERANGE_7BIT;
3228 }
3229 else {
3230 res_encindex = ptr_encindex;
3231 res_cr = ptr_cr;
3232 }
3233 }
3234 else if (str_cr == ENC_CODERANGE_VALID) {
3235 res_encindex = str_encindex;
3236 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3237 res_cr = str_cr;
3238 else
3239 res_cr = ptr_cr;
3240 }
3241 else { /* str_cr == ENC_CODERANGE_BROKEN */
3242 res_encindex = str_encindex;
3243 res_cr = str_cr;
3244 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3245 }
3246
3247 if (len < 0) {
3248 rb_raise(rb_eArgError, "negative string size (or size too big)");
3249 }
3250 str_buf_cat(str, ptr, len);
3251 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3252 return str;
3253
3254 incompatible:
3255 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3256 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3258}
3259
3260VALUE
3261rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3262{
3263 return rb_enc_cr_str_buf_cat(str, ptr, len,
3264 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3265}
3266
3267VALUE
3269{
3270 /* ptr must reference NUL terminated ASCII string. */
3271 int encindex = ENCODING_GET(str);
3272 rb_encoding *enc = rb_enc_from_index(encindex);
3273 if (rb_enc_asciicompat(enc)) {
3274 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3275 encindex, ENC_CODERANGE_7BIT, 0);
3276 }
3277 else {
3278 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3279 while (*ptr) {
3280 unsigned int c = (unsigned char)*ptr;
3281 int len = rb_enc_codelen(c, enc);
3282 rb_enc_mbcput(c, buf, enc);
3283 rb_enc_cr_str_buf_cat(str, buf, len,
3284 encindex, ENC_CODERANGE_VALID, 0);
3285 ptr++;
3286 }
3287 return str;
3288 }
3289}
3290
3291VALUE
3293{
3294 int str2_cr = rb_enc_str_coderange(str2);
3295
3296 if (str_enc_fastpath(str)) {
3297 switch (str2_cr) {
3298 case ENC_CODERANGE_7BIT:
3299 // If RHS is 7bit we can do simple concatenation
3300 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3301 return str;
3303 // If RHS is valid, we can do simple concatenation if encodings are the same
3304 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3305 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3306 int str_cr = ENC_CODERANGE(str);
3307 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3308 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3309 }
3310 return str;
3311 }
3312 }
3313 }
3314
3315 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3316 ENCODING_GET(str2), str2_cr, &str2_cr);
3317
3318 ENC_CODERANGE_SET(str2, str2_cr);
3319
3320 return str;
3321}
3322
3323VALUE
3325{
3326 StringValue(str2);
3327 return rb_str_buf_append(str, str2);
3328}
3329
3330#define MIN_PRE_ALLOC_SIZE 48
3331
3332MJIT_FUNC_EXPORTED VALUE
3333rb_str_concat_literals(size_t num, const VALUE *strary)
3334{
3335 VALUE str;
3336 size_t i, s;
3337 long len = 1;
3338
3339 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3340 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3341
3342 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3343 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3344 str = rb_str_resurrect(strary[0]);
3345 s = 1;
3346 }
3347 else {
3348 str = rb_str_buf_new(len);
3349 rb_enc_copy(str, strary[0]);
3350 s = 0;
3351 }
3352
3353 for (i = s; i < num; ++i) {
3354 const VALUE v = strary[i];
3355 int encidx = ENCODING_GET(v);
3356
3357 rb_str_buf_append(str, v);
3358 if (encidx != ENCINDEX_US_ASCII) {
3359 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3360 rb_enc_set_index(str, encidx);
3361 }
3362 }
3363 return str;
3364}
3365
3366/*
3367 * call-seq:
3368 * concat(*objects) -> string
3369 *
3370 * Concatenates each object in +objects+ to +self+ and returns +self+:
3371 *
3372 * s = 'foo'
3373 * s.concat('bar', 'baz') # => "foobarbaz"
3374 * s # => "foobarbaz"
3375 *
3376 * For each given object +object+ that is an \Integer,
3377 * the value is considered a codepoint and converted to a character before concatenation:
3378 *
3379 * s = 'foo'
3380 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3381 *
3382 * Related: String#<<, which takes a single argument.
3383 */
3384static VALUE
3385rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3386{
3387 str_modifiable(str);
3388
3389 if (argc == 1) {
3390 return rb_str_concat(str, argv[0]);
3391 }
3392 else if (argc > 1) {
3393 int i;
3394 VALUE arg_str = rb_str_tmp_new(0);
3395 rb_enc_copy(arg_str, str);
3396 for (i = 0; i < argc; i++) {
3397 rb_str_concat(arg_str, argv[i]);
3398 }
3399 rb_str_buf_append(str, arg_str);
3400 }
3401
3402 return str;
3403}
3404
3405/*
3406 * call-seq:
3407 * string << object -> string
3408 *
3409 * Concatenates +object+ to +self+ and returns +self+:
3410 *
3411 * s = 'foo'
3412 * s << 'bar' # => "foobar"
3413 * s # => "foobar"
3414 *
3415 * If +object+ is an \Integer,
3416 * the value is considered a codepoint and converted to a character before concatenation:
3417 *
3418 * s = 'foo'
3419 * s << 33 # => "foo!"
3420 *
3421 * Related: String#concat, which takes multiple arguments.
3422 */
3423VALUE
3425{
3426 unsigned int code;
3427 rb_encoding *enc = STR_ENC_GET(str1);
3428 int encidx;
3429
3430 if (RB_INTEGER_TYPE_P(str2)) {
3431 if (rb_num_to_uint(str2, &code) == 0) {
3432 }
3433 else if (FIXNUM_P(str2)) {
3434 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3435 }
3436 else {
3437 rb_raise(rb_eRangeError, "bignum out of char range");
3438 }
3439 }
3440 else {
3441 return rb_str_append(str1, str2);
3442 }
3443
3444 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3445 if (encidx >= 0) {
3446 char buf[1];
3447 buf[0] = (char)code;
3448 rb_str_cat(str1, buf, 1);
3449 if (encidx != rb_enc_to_index(enc)) {
3450 rb_enc_associate_index(str1, encidx);
3452 }
3453 }
3454 else {
3455 long pos = RSTRING_LEN(str1);
3456 int cr = ENC_CODERANGE(str1);
3457 int len;
3458 char *buf;
3459
3460 switch (len = rb_enc_codelen(code, enc)) {
3461 case ONIGERR_INVALID_CODE_POINT_VALUE:
3462 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3463 break;
3464 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3465 case 0:
3466 rb_raise(rb_eRangeError, "%u out of char range", code);
3467 break;
3468 }
3469 buf = ALLOCA_N(char, len + 1);
3470 rb_enc_mbcput(code, buf, enc);
3471 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3472 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3473 }
3474 rb_str_resize(str1, pos+len);
3475 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3476 if (cr == ENC_CODERANGE_7BIT && code > 127)
3478 ENC_CODERANGE_SET(str1, cr);
3479 }
3480 return str1;
3481}
3482
3483int
3484rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3485{
3486 int encidx = rb_enc_to_index(enc);
3487
3488 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3489 /* US-ASCII automatically extended to ASCII-8BIT */
3490 if (code > 0xFF) {
3491 rb_raise(rb_eRangeError, "%u out of char range", code);
3492 }
3493 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3494 return ENCINDEX_ASCII_8BIT;
3495 }
3496 return encidx;
3497 }
3498 else {
3499 return -1;
3500 }
3501}
3502
3503/*
3504 * call-seq:
3505 * prepend(*other_strings) -> string
3506 *
3507 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3508 *
3509 * s = 'foo'
3510 * s.prepend('bar', 'baz') # => "barbazfoo"
3511 * s # => "barbazfoo"
3512 *
3513 * Related: String#concat.
3514 */
3515
3516static VALUE
3517rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3518{
3519 str_modifiable(str);
3520
3521 if (argc == 1) {
3522 rb_str_update(str, 0L, 0L, argv[0]);
3523 }
3524 else if (argc > 1) {
3525 int i;
3526 VALUE arg_str = rb_str_tmp_new(0);
3527 rb_enc_copy(arg_str, str);
3528 for (i = 0; i < argc; i++) {
3529 rb_str_append(arg_str, argv[i]);
3530 }
3531 rb_str_update(str, 0L, 0L, arg_str);
3532 }
3533
3534 return str;
3535}
3536
3537st_index_t
3539{
3540 int e = ENCODING_GET(str);
3541 if (e && is_ascii_string(str)) {
3542 e = 0;
3543 }
3544 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3545}
3546
3547int
3549{
3550 long len1, len2;
3551 const char *ptr1, *ptr2;
3552 RSTRING_GETMEM(str1, ptr1, len1);
3553 RSTRING_GETMEM(str2, ptr2, len2);
3554 return (len1 != len2 ||
3555 !rb_str_comparable(str1, str2) ||
3556 memcmp(ptr1, ptr2, len1) != 0);
3557}
3558
3559/*
3560 * call-seq:
3561 * hash -> integer
3562 *
3563 * Returns the integer hash value for +self+.
3564 * The value is based on the length, content and encoding of +self+.
3565 *
3566 * Related: Object#hash.
3567 */
3568
3569static VALUE
3570rb_str_hash_m(VALUE str)
3571{
3572 st_index_t hval = rb_str_hash(str);
3573 return ST2FIX(hval);
3574}
3575
/* Yields the smaller of the two arguments (the first one when they are
 * equal).  Arguments may be evaluated more than once. */
#define lesser(a,b) (((b) < (a)) ? (b) : (a))
3577
3578int
3580{
3581 int idx1, idx2;
3582 int rc1, rc2;
3583
3584 if (RSTRING_LEN(str1) == 0) return TRUE;
3585 if (RSTRING_LEN(str2) == 0) return TRUE;
3586 idx1 = ENCODING_GET(str1);
3587 idx2 = ENCODING_GET(str2);
3588 if (idx1 == idx2) return TRUE;
3589 rc1 = rb_enc_str_coderange(str1);
3590 rc2 = rb_enc_str_coderange(str2);
3591 if (rc1 == ENC_CODERANGE_7BIT) {
3592 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3593 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3594 return TRUE;
3595 }
3596 if (rc2 == ENC_CODERANGE_7BIT) {
3597 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3598 return TRUE;
3599 }
3600 return FALSE;
3601}
3602
3603int
3605{
3606 long len1, len2;
3607 const char *ptr1, *ptr2;
3608 int retval;
3609
3610 if (str1 == str2) return 0;
3611 RSTRING_GETMEM(str1, ptr1, len1);
3612 RSTRING_GETMEM(str2, ptr2, len2);
3613 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3614 if (len1 == len2) {
3615 if (!rb_str_comparable(str1, str2)) {
3616 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3617 return 1;
3618 return -1;
3619 }
3620 return 0;
3621 }
3622 if (len1 > len2) return 1;
3623 return -1;
3624 }
3625 if (retval > 0) return 1;
3626 return -1;
3627}
3628
3629/*
3630 * call-seq:
3631 * string == object -> true or false
3632 * string === object -> true or false
3633 *
3634 * Returns +true+ if +object+ has the same length and content;
3635 * as +self+; +false+ otherwise:
3636 *
3637 * s = 'foo'
3638 * s == 'foo' # => true
3639 * s == 'food' # => false
3640 * s == 'FOO' # => false
3641 *
3642 * Returns +false+ if the two strings' encodings are not compatible:
3643 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3644 *
3645 * If +object+ is not an instance of \String but responds to +to_str+, then the
3646 * two strings are compared using <code>object.==</code>.
3647 */
3648
3649VALUE
3651{
3652 if (str1 == str2) return Qtrue;
3653 if (!RB_TYPE_P(str2, T_STRING)) {
3654 if (!rb_respond_to(str2, idTo_str)) {
3655 return Qfalse;
3656 }
3657 return rb_equal(str2, str1);
3658 }
3659 return rb_str_eql_internal(str1, str2);
3660}
3661
3662/*
3663 * call-seq:
3664 * eql?(object) -> true or false
3665 *
3666 * Returns +true+ if +object+ has the same length and content;
3667 * as +self+; +false+ otherwise:
3668 *
3669 * s = 'foo'
3670 * s.eql?('foo') # => true
3671 * s.eql?('food') # => false
3672 * s.eql?('FOO') # => false
3673 *
3674 * Returns +false+ if the two strings' encodings are not compatible:
3675 *
3676 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3677 *
3678 */
3679
3680MJIT_FUNC_EXPORTED VALUE
3681rb_str_eql(VALUE str1, VALUE str2)
3682{
3683 if (str1 == str2) return Qtrue;
3684 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3685 return rb_str_eql_internal(str1, str2);
3686}
3687
3688/*
3689 * call-seq:
3690 * string <=> other_string -> -1, 0, 1, or nil
3691 *
3692 * Compares +self+ and +other_string+, returning:
3693 *
3694 * - -1 if +other_string+ is larger.
3695 * - 0 if the two are equal.
3696 * - 1 if +other_string+ is smaller.
3697 * - +nil+ if the two are incomparable.
3698 *
3699 * Examples:
3700 *
3701 * 'foo' <=> 'foo' # => 0
3702 * 'foo' <=> 'food' # => -1
3703 * 'food' <=> 'foo' # => 1
3704 * 'FOO' <=> 'foo' # => -1
3705 * 'foo' <=> 'FOO' # => 1
3706 * 'foo' <=> 1 # => nil
3707 *
3708 */
3709
3710static VALUE
3711rb_str_cmp_m(VALUE str1, VALUE str2)
3712{
3713 int result;
3714 VALUE s = rb_check_string_type(str2);
3715 if (NIL_P(s)) {
3716 return rb_invcmp(str1, str2);
3717 }
3718 result = rb_str_cmp(str1, s);
3719 return INT2FIX(result);
3720}
3721
3722static VALUE str_casecmp(VALUE str1, VALUE str2);
3723static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3724
3725/*
3726 * call-seq:
3727 * casecmp(other_string) -> -1, 0, 1, or nil
3728 *
3729 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3730 *
3731 * - -1 if <tt>other_string.downcase</tt> is larger.
3732 * - 0 if the two are equal.
3733 * - 1 if <tt>other_string.downcase</tt> is smaller.
3734 * - +nil+ if the two are incomparable.
3735 *
3736 * Examples:
3737 *
3738 * 'foo'.casecmp('foo') # => 0
3739 * 'foo'.casecmp('food') # => -1
3740 * 'food'.casecmp('foo') # => 1
3741 * 'FOO'.casecmp('foo') # => 0
3742 * 'foo'.casecmp('FOO') # => 0
3743 * 'foo'.casecmp(1) # => nil
3744 *
3745 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3746 *
3747 * Related: String#casecmp?.
3748 *
3749 */
3750
3751static VALUE
3752rb_str_casecmp(VALUE str1, VALUE str2)
3753{
3754 VALUE s = rb_check_string_type(str2);
3755 if (NIL_P(s)) {
3756 return Qnil;
3757 }
3758 return str_casecmp(str1, s);
3759}
3760
3761static VALUE
3762str_casecmp(VALUE str1, VALUE str2)
3763{
3764 long len;
3765 rb_encoding *enc;
3766 const char *p1, *p1end, *p2, *p2end;
3767
3768 enc = rb_enc_compatible(str1, str2);
3769 if (!enc) {
3770 return Qnil;
3771 }
3772
3773 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3774 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3775 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3776 while (p1 < p1end && p2 < p2end) {
3777 if (*p1 != *p2) {
3778 unsigned int c1 = TOLOWER(*p1 & 0xff);
3779 unsigned int c2 = TOLOWER(*p2 & 0xff);
3780 if (c1 != c2)
3781 return INT2FIX(c1 < c2 ? -1 : 1);
3782 }
3783 p1++;
3784 p2++;
3785 }
3786 }
3787 else {
3788 while (p1 < p1end && p2 < p2end) {
3789 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3790 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3791
3792 if (0 <= c1 && 0 <= c2) {
3793 c1 = TOLOWER(c1);
3794 c2 = TOLOWER(c2);
3795 if (c1 != c2)
3796 return INT2FIX(c1 < c2 ? -1 : 1);
3797 }
3798 else {
3799 int r;
3800 l1 = rb_enc_mbclen(p1, p1end, enc);
3801 l2 = rb_enc_mbclen(p2, p2end, enc);
3802 len = l1 < l2 ? l1 : l2;
3803 r = memcmp(p1, p2, len);
3804 if (r != 0)
3805 return INT2FIX(r < 0 ? -1 : 1);
3806 if (l1 != l2)
3807 return INT2FIX(l1 < l2 ? -1 : 1);
3808 }
3809 p1 += l1;
3810 p2 += l2;
3811 }
3812 }
3813 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3814 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3815 return INT2FIX(-1);
3816}
3817
3818/*
3819 * call-seq:
3820 * casecmp?(other_string) -> true, false, or nil
3821 *
3822 * Returns +true+ if +self+ and +other_string+ are equal after
3823 * Unicode case folding, otherwise +false+:
3824 *
3825 * 'foo'.casecmp?('foo') # => true
3826 * 'foo'.casecmp?('food') # => false
3827 * 'food'.casecmp?('foo') # => false
3828 * 'FOO'.casecmp?('foo') # => true
3829 * 'foo'.casecmp?('FOO') # => true
3830 *
3831 * Returns +nil+ if the two values are incomparable:
3832 *
3833 * 'foo'.casecmp?(1) # => nil
3834 *
3835 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3836 *
3837 * Related: String#casecmp.
3838 *
3839 */
3840
3841static VALUE
3842rb_str_casecmp_p(VALUE str1, VALUE str2)
3843{
3844 VALUE s = rb_check_string_type(str2);
3845 if (NIL_P(s)) {
3846 return Qnil;
3847 }
3848 return str_casecmp_p(str1, s);
3849}
3850
3851static VALUE
3852str_casecmp_p(VALUE str1, VALUE str2)
3853{
3854 rb_encoding *enc;
3855 VALUE folded_str1, folded_str2;
3856 VALUE fold_opt = sym_fold;
3857
3858 enc = rb_enc_compatible(str1, str2);
3859 if (!enc) {
3860 return Qnil;
3861 }
3862
3863 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3864 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3865
3866 return rb_str_eql(folded_str1, folded_str2);
3867}
3868
3869static long
3870strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3871 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3872{
3873 const char *search_start = str_ptr;
3874 long pos, search_len = str_len - offset;
3875
3876 for (;;) {
3877 const char *t;
3878 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3879 if (pos < 0) return pos;
3880 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3881 if (t == search_start + pos) break;
3882 search_len -= t - search_start;
3883 if (search_len <= 0) return -1;
3884 offset += t - search_start;
3885 search_start = t;
3886 }
3887 return pos + offset;
3888}
3889
3890#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3891
3892static long
3893rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3894{
3895 const char *str_ptr, *str_ptr_end, *sub_ptr;
3896 long str_len, sub_len;
3897 rb_encoding *enc;
3898
3899 enc = rb_enc_check(str, sub);
3900 if (is_broken_string(sub)) return -1;
3901
3902 str_ptr = RSTRING_PTR(str);
3903 str_ptr_end = RSTRING_END(str);
3904 str_len = RSTRING_LEN(str);
3905 sub_ptr = RSTRING_PTR(sub);
3906 sub_len = RSTRING_LEN(sub);
3907
3908 if (str_len < sub_len) return -1;
3909
3910 if (offset != 0) {
3911 long str_len_char, sub_len_char;
3912 int single_byte = single_byte_optimizable(str);
3913 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3914 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3915 if (offset < 0) {
3916 offset += str_len_char;
3917 if (offset < 0) return -1;
3918 }
3919 if (str_len_char - offset < sub_len_char) return -1;
3920 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3921 str_ptr += offset;
3922 }
3923 if (sub_len == 0) return offset;
3924
3925 /* need proceed one character at a time */
3926 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3927}
3928
3929
3930/*
3931 * call-seq:
3932 * index(substring, offset = 0) -> integer or nil
3933 * index(regexp, offset = 0) -> integer or nil
3934 *
3935 * :include: doc/string/index.rdoc
3936 *
3937 */
3938
3939static VALUE
3940rb_str_index_m(int argc, VALUE *argv, VALUE str)
3941{
3942 VALUE sub;
3943 VALUE initpos;
3944 long pos;
3945
3946 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3947 pos = NUM2LONG(initpos);
3948 }
3949 else {
3950 pos = 0;
3951 }
3952 if (pos < 0) {
3953 pos += str_strlen(str, NULL);
3954 if (pos < 0) {
3955 if (RB_TYPE_P(sub, T_REGEXP)) {
3957 }
3958 return Qnil;
3959 }
3960 }
3961
3962 if (RB_TYPE_P(sub, T_REGEXP)) {
3963 if (pos > str_strlen(str, NULL))
3964 return Qnil;
3965 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3966 rb_enc_check(str, sub), single_byte_optimizable(str));
3967
3968 if (rb_reg_search(sub, str, pos, 0) < 0) {
3969 return Qnil;
3970 }
3971 else {
3972 VALUE match = rb_backref_get();
3973 struct re_registers *regs = RMATCH_REGS(match);
3974 pos = rb_str_sublen(str, BEG(0));
3975 return LONG2NUM(pos);
3976 }
3977 }
3978 else {
3979 StringValue(sub);
3980 pos = rb_str_index(str, sub, pos);
3981 pos = rb_str_sublen(str, pos);
3982 }
3983
3984 if (pos == -1) return Qnil;
3985 return LONG2NUM(pos);
3986}
3987
3988/* whether given pos is valid character boundary or not
3989 * Note that in this function, "character" means a code point
3990 * (Unicode scalar value), not a grapheme cluster.
3991 */
3992static bool
3993str_check_byte_pos(VALUE str, long pos)
3994{
3995 const char *s = RSTRING_PTR(str);
3996 const char *e = RSTRING_END(str);
3997 const char *p = s + pos;
3998 const char *pp = rb_enc_left_char_head(s, p, e, rb_enc_get(str));
3999 return p == pp;
4000}
4001
4002/*
4003 * call-seq:
4004 * byteindex(substring, offset = 0) -> integer or nil
4005 * byteindex(regexp, offset = 0) -> integer or nil
4006 *
4007 * Returns the \Integer byte-based index of the first occurrence of the given +substring+,
4008 * or +nil+ if none found:
4009 *
4010 * 'foo'.byteindex('f') # => 0
4011 * 'foo'.byteindex('o') # => 1
4012 * 'foo'.byteindex('oo') # => 1
4013 * 'foo'.byteindex('ooo') # => nil
4014 *
4015 * Returns the \Integer byte-based index of the first match for the given \Regexp +regexp+,
4016 * or +nil+ if none found:
4017 *
4018 * 'foo'.byteindex(/f/) # => 0
4019 * 'foo'.byteindex(/o/) # => 1
4020 * 'foo'.byteindex(/oo/) # => 1
4021 * 'foo'.byteindex(/ooo/) # => nil
4022 *
4023 * \Integer argument +offset+, if given, specifies the byte-based position in the
4024 * string to begin the search:
4025 *
4026 * 'foo'.byteindex('o', 1) # => 1
4027 * 'foo'.byteindex('o', 2) # => 2
4028 * 'foo'.byteindex('o', 3) # => nil
4029 *
4030 * If +offset+ is negative, counts backward from the end of +self+:
4031 *
4032 * 'foo'.byteindex('o', -1) # => 2
4033 * 'foo'.byteindex('o', -2) # => 1
4034 * 'foo'.byteindex('o', -3) # => 1
4035 * 'foo'.byteindex('o', -4) # => nil
4036 *
4037 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4038 * raised.
4039 *
4040 * Related: String#index, String#byterindex.
4041 */
4042
4043static VALUE
4044rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4045{
4046 VALUE sub;
4047 VALUE initpos;
4048 long pos;
4049
4050 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4051 pos = NUM2LONG(initpos);
4052 }
4053 else {
4054 pos = 0;
4055 }
4056 if (pos < 0) {
4057 pos += RSTRING_LEN(str);
4058 if (pos < 0) {
4059 if (RB_TYPE_P(sub, T_REGEXP)) {
4061 }
4062 return Qnil;
4063 }
4064 }
4065
4066 if (!str_check_byte_pos(str, pos)) {
4068 "offset %ld does not land on character boundary", pos);
4069 }
4070
4071 if (RB_TYPE_P(sub, T_REGEXP)) {
4072 if (pos > RSTRING_LEN(str))
4073 return Qnil;
4074 if (rb_reg_search(sub, str, pos, 0) < 0) {
4075 return Qnil;
4076 }
4077 else {
4078 VALUE match = rb_backref_get();
4079 struct re_registers *regs = RMATCH_REGS(match);
4080 pos = BEG(0);
4081 return LONG2NUM(pos);
4082 }
4083 }
4084 else {
4085 StringValue(sub);
4086 pos = rb_strseq_index(str, sub, pos, 1);
4087 }
4088
4089 if (pos == -1) return Qnil;
4090 return LONG2NUM(pos);
4091}
4092
4093#ifdef HAVE_MEMRCHR
4094static long
4095str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4096{
4097 char *hit, *adjusted;
4098 int c;
4099 long slen, searchlen;
4100 char *sbeg, *e, *t;
4101
4102 sbeg = RSTRING_PTR(str);
4103 slen = RSTRING_LEN(sub);
4104 if (slen == 0) return s - sbeg;
4105 e = RSTRING_END(str);
4106 t = RSTRING_PTR(sub);
4107 c = *t & 0xff;
4108 searchlen = s - sbeg + 1;
4109
4110 do {
4111 hit = memrchr(sbeg, c, searchlen);
4112 if (!hit) break;
4113 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4114 if (hit != adjusted) {
4115 searchlen = adjusted - sbeg;
4116 continue;
4117 }
4118 if (memcmp(hit, t, slen) == 0)
4119 return hit - sbeg;
4120 searchlen = adjusted - sbeg;
4121 } while (searchlen > 0);
4122
4123 return -1;
4124}
4125#else
4126static long
4127str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4128{
4129 long slen;
4130 char *sbeg, *e, *t;
4131
4132 sbeg = RSTRING_PTR(str);
4133 e = RSTRING_END(str);
4134 t = RSTRING_PTR(sub);
4135 slen = RSTRING_LEN(sub);
4136
4137 while (s) {
4138 if (memcmp(s, t, slen) == 0) {
4139 return s - sbeg;
4140 }
4141 if (s <= sbeg) break;
4142 s = rb_enc_prev_char(sbeg, s, e, enc);
4143 }
4144
4145 return -1;
4146}
4147#endif
4148
4149static long
4150rb_str_rindex(VALUE str, VALUE sub, long pos)
4151{
4152 long len, slen;
4153 char *sbeg, *s;
4154 rb_encoding *enc;
4155 int singlebyte;
4156
4157 enc = rb_enc_check(str, sub);
4158 if (is_broken_string(sub)) return -1;
4159 singlebyte = single_byte_optimizable(str);
4160 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4161 slen = str_strlen(sub, enc); /* rb_enc_check */
4162
4163 /* substring longer than string */
4164 if (len < slen) return -1;
4165 if (len - pos < slen) pos = len - slen;
4166 if (len == 0) return pos;
4167
4168 sbeg = RSTRING_PTR(str);
4169
4170 if (pos == 0) {
4171 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4172 return 0;
4173 else
4174 return -1;
4175 }
4176
4177 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4178 return rb_str_sublen(str, str_rindex(str, sub, s, enc));
4179}
4180
4181/*
4182 * call-seq:
4183 * rindex(substring, offset = self.length) -> integer or nil
4184 * rindex(regexp, offset = self.length) -> integer or nil
4185 *
4186 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4187 * or +nil+ if none found:
4188 *
4189 * 'foo'.rindex('f') # => 0
4190 * 'foo'.rindex('o') # => 2
4191 * 'foo'.rindex('oo') # => 1
4192 * 'foo'.rindex('ooo') # => nil
4193 *
4194 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4195 * or +nil+ if none found:
4196 *
4197 * 'foo'.rindex(/f/) # => 0
4198 * 'foo'.rindex(/o/) # => 2
4199 * 'foo'.rindex(/oo/) # => 1
4200 * 'foo'.rindex(/ooo/) # => nil
4201 *
4202 * The _last_ match means starting at the possible last position, not
4203 * the last of longest matches.
4204 *
4205 * 'foo'.rindex(/o+/) # => 2
4206 * $~ #=> #<MatchData "o">
4207 *
4208 * To get the last longest match, needs to combine with negative
4209 * lookbehind.
4210 *
4211 * 'foo'.rindex(/(?<!o)o+/) # => 1
4212 * $~ #=> #<MatchData "oo">
4213 *
4214 * Or String#index with negative lookahead.
4215 *
4216 * 'foo'.index(/o+(?!.*o)/) # => 1
4217 * $~ #=> #<MatchData "oo">
4218 *
4219 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4220 * string to _end_ the search:
4221 *
4222 * 'foo'.rindex('o', 0) # => nil
4223 * 'foo'.rindex('o', 1) # => 1
4224 * 'foo'.rindex('o', 2) # => 2
4225 * 'foo'.rindex('o', 3) # => 2
4226 *
4227 * If +offset+ is a negative \Integer, the maximum starting position in the
4228 * string to _end_ the search is the sum of the string's length and +offset+:
4229 *
4230 * 'foo'.rindex('o', -1) # => 2
4231 * 'foo'.rindex('o', -2) # => 1
4232 * 'foo'.rindex('o', -3) # => nil
4233 * 'foo'.rindex('o', -4) # => nil
4234 *
4235 * Related: String#index.
4236 */
4237
4238static VALUE
4239rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4240{
4241 VALUE sub;
4242 VALUE vpos;
4243 rb_encoding *enc = STR_ENC_GET(str);
4244 long pos, len = str_strlen(str, enc); /* str's enc */
4245
4246 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4247 pos = NUM2LONG(vpos);
4248 if (pos < 0) {
4249 pos += len;
4250 if (pos < 0) {
4251 if (RB_TYPE_P(sub, T_REGEXP)) {
4253 }
4254 return Qnil;
4255 }
4256 }
4257 if (pos > len) pos = len;
4258 }
4259 else {
4260 pos = len;
4261 }
4262
4263 if (RB_TYPE_P(sub, T_REGEXP)) {
4264 /* enc = rb_get_check(str, sub); */
4265 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4266 enc, single_byte_optimizable(str));
4267
4268 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4269 VALUE match = rb_backref_get();
4270 struct re_registers *regs = RMATCH_REGS(match);
4271 pos = rb_str_sublen(str, BEG(0));
4272 return LONG2NUM(pos);
4273 }
4274 }
4275 else {
4276 StringValue(sub);
4277 pos = rb_str_rindex(str, sub, pos);
4278 if (pos >= 0) return LONG2NUM(pos);
4279 }
4280 return Qnil;
4281}
4282
4283static long
4284rb_str_byterindex(VALUE str, VALUE sub, long pos)
4285{
4286 long len, slen;
4287 char *sbeg, *s;
4288 rb_encoding *enc;
4289
4290 enc = rb_enc_check(str, sub);
4291 if (is_broken_string(sub)) return -1;
4292 len = RSTRING_LEN(str);
4293 slen = RSTRING_LEN(sub);
4294
4295 /* substring longer than string */
4296 if (len < slen) return -1;
4297 if (len - pos < slen) pos = len - slen;
4298 if (len == 0) return pos;
4299
4300 sbeg = RSTRING_PTR(str);
4301
4302 if (pos == 0) {
4303 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4304 return 0;
4305 else
4306 return -1;
4307 }
4308
4309 s = sbeg + pos;
4310 return str_rindex(str, sub, s, enc);
4311}
4312
4313
4314/*
4315 * call-seq:
4316 * byterindex(substring, offset = self.bytesize) -> integer or nil
4317 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4318 *
4319 * Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+,
4320 * or +nil+ if none found:
4321 *
4322 * 'foo'.byterindex('f') # => 0
4323 * 'foo'.byterindex('o') # => 2
4324 * 'foo'.byterindex('oo') # => 1
4325 * 'foo'.byterindex('ooo') # => nil
4326 *
4327 * Returns the \Integer byte-based index of the _last_ match for the given \Regexp +regexp+,
4328 * or +nil+ if none found:
4329 *
4330 * 'foo'.byterindex(/f/) # => 0
4331 * 'foo'.byterindex(/o/) # => 2
4332 * 'foo'.byterindex(/oo/) # => 1
4333 * 'foo'.byterindex(/ooo/) # => nil
4334 *
4335 * The _last_ match means starting at the possible last position, not
4336 * the last of longest matches.
4337 *
4338 * 'foo'.byterindex(/o+/) # => 2
4339 * $~ #=> #<MatchData "o">
4340 *
4341 * To get the last longest match, needs to combine with negative
4342 * lookbehind.
4343 *
4344 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4345 * $~ #=> #<MatchData "oo">
4346 *
4347 * Or String#byteindex with negative lookahead.
4348 *
4349 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4350 * $~ #=> #<MatchData "oo">
4351 *
4352 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4353 * string to _end_ the search:
4354 *
4355 * 'foo'.byterindex('o', 0) # => nil
4356 * 'foo'.byterindex('o', 1) # => 1
4357 * 'foo'.byterindex('o', 2) # => 2
4358 * 'foo'.byterindex('o', 3) # => 2
4359 *
4360 * If +offset+ is a negative \Integer, the maximum starting position in the
4361 * string to _end_ the search is the sum of the string's length and +offset+:
4362 *
4363 * 'foo'.byterindex('o', -1) # => 2
4364 * 'foo'.byterindex('o', -2) # => 1
4365 * 'foo'.byterindex('o', -3) # => nil
4366 * 'foo'.byterindex('o', -4) # => nil
4367 *
4368 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4369 * raised.
4370 *
4371 * Related: String#byteindex.
4372 */
4373
4374static VALUE
4375rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4376{
4377 VALUE sub;
4378 VALUE vpos;
4379 long pos, len = RSTRING_LEN(str);
4380
4381 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4382 pos = NUM2LONG(vpos);
4383 if (pos < 0) {
4384 pos += len;
4385 if (pos < 0) {
4386 if (RB_TYPE_P(sub, T_REGEXP)) {
4388 }
4389 return Qnil;
4390 }
4391 }
4392 if (pos > len) pos = len;
4393 }
4394 else {
4395 pos = len;
4396 }
4397
4398 if (!str_check_byte_pos(str, pos)) {
4400 "offset %ld does not land on character boundary", pos);
4401 }
4402
4403 if (RB_TYPE_P(sub, T_REGEXP)) {
4404 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4405 VALUE match = rb_backref_get();
4406 struct re_registers *regs = RMATCH_REGS(match);
4407 pos = BEG(0);
4408 return LONG2NUM(pos);
4409 }
4410 }
4411 else {
4412 StringValue(sub);
4413 pos = rb_str_byterindex(str, sub, pos);
4414 if (pos >= 0) return LONG2NUM(pos);
4415 }
4416 return Qnil;
4417}
4418
4419/*
4420 * call-seq:
4421 * string =~ regexp -> integer or nil
4422 * string =~ object -> integer or nil
4423 *
4424 * Returns the \Integer index of the first substring that matches
4425 * the given +regexp+, or +nil+ if no match found:
4426 *
4427 * 'foo' =~ /f/ # => 0
4428 * 'foo' =~ /o/ # => 1
4429 * 'foo' =~ /x/ # => nil
4430 *
4431 * Note: also updates Regexp@Special+global+variables.
4432 *
4433 * If the given +object+ is not a \Regexp, returns the value
4434 * returned by <tt>object =~ self</tt>.
4435 *
4436 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4437 * (see Regexp#=~):
4438 *
4439 * number= nil
4440 * "no. 9" =~ /(?<number>\d+)/
4441 * number # => nil (not assigned)
4442 * /(?<number>\d+)/ =~ "no. 9"
4443 * number #=> "9"
4444 *
4445 */
4446
4447static VALUE
4448rb_str_match(VALUE x, VALUE y)
4449{
4450 switch (OBJ_BUILTIN_TYPE(y)) {
4451 case T_STRING:
4452 rb_raise(rb_eTypeError, "type mismatch: String given");
4453
4454 case T_REGEXP:
4455 return rb_reg_match(y, x);
4456
4457 default:
4458 return rb_funcall(y, idEqTilde, 1, x);
4459 }
4460}
4461
4462
4463static VALUE get_pat(VALUE);
4464
4465
4466/*
4467 * call-seq:
4468 * match(pattern, offset = 0) -> matchdata or nil
4469 * match(pattern, offset = 0) {|matchdata| ... } -> object
4470 *
4471 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4472 *
4473 * Note: also updates Regexp@Special+global+variables.
4474 *
4475 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4476 * regexp = Regexp.new(pattern)
4477 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4478 * (see Regexp#match):
4479 * matchdata = regexp.match(self)
4480 *
4481 * With no block given, returns the computed +matchdata+:
4482 *
4483 * 'foo'.match('f') # => #<MatchData "f">
4484 * 'foo'.match('o') # => #<MatchData "o">
4485 * 'foo'.match('x') # => nil
4486 *
4487 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4488 *
4489 * 'foo'.match('f', 1) # => nil
4490 * 'foo'.match('o', 1) # => #<MatchData "o">
4491 *
4492 * With a block given, calls the block with the computed +matchdata+
4493 * and returns the block's return value:
4494 *
4495 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4496 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4497 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4498 *
4499 */
4500
4501static VALUE
4502rb_str_match_m(int argc, VALUE *argv, VALUE str)
4503{
4504 VALUE re, result;
4505 if (argc < 1)
4506 rb_check_arity(argc, 1, 2);
4507 re = argv[0];
4508 argv[0] = str;
4509 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4510 if (!NIL_P(result) && rb_block_given_p()) {
4511 return rb_yield(result);
4512 }
4513 return result;
4514}
4515
4516/*
4517 * call-seq:
4518 * match?(pattern, offset = 0) -> true or false
4519 *
4520 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4521 *
4522 * Note: does not update Regexp@Special+global+variables.
4523 *
4524 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4525 * regexp = Regexp.new(pattern)
4526 *
4527 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4528 * +false+ otherwise:
4529 *
4530 * 'foo'.match?(/o/) # => true
4531 * 'foo'.match?('o') # => true
4532 * 'foo'.match?(/x/) # => false
4533 *
4534 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4535 * 'foo'.match?('f', 1) # => false
4536 * 'foo'.match?('o', 1) # => true
4537 *
4538 */
4539
4540static VALUE
4541rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4542{
4543 VALUE re;
4544 rb_check_arity(argc, 1, 2);
4545 re = get_pat(argv[0]);
4546 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4547}
4548
/* Result of stepping a character to its successor or predecessor. */
enum neighbor_char {
    /* the bytes are not a valid character, or no neighbor could be encoded */
    NEIGHBOR_NOT_CHAR = 0,
    /* the neighbor was written over the character in place */
    NEIGHBOR_FOUND = 1,
    /* stepping ran past the values that fit in the character's byte length */
    NEIGHBOR_WRAPPED = 2
};
4554
4555static enum neighbor_char
4556enc_succ_char(char *p, long len, rb_encoding *enc)
4557{
4558 long i;
4559 int l;
4560
4561 if (rb_enc_mbminlen(enc) > 1) {
4562 /* wchar, trivial case */
4563 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4564 if (!MBCLEN_CHARFOUND_P(r)) {
4565 return NEIGHBOR_NOT_CHAR;
4566 }
4567 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4568 l = rb_enc_code_to_mbclen(c, enc);
4569 if (!l) return NEIGHBOR_NOT_CHAR;
4570 if (l != len) return NEIGHBOR_WRAPPED;
4571 rb_enc_mbcput(c, p, enc);
4572 r = rb_enc_precise_mbclen(p, p + len, enc);
4573 if (!MBCLEN_CHARFOUND_P(r)) {
4574 return NEIGHBOR_NOT_CHAR;
4575 }
4576 return NEIGHBOR_FOUND;
4577 }
4578 while (1) {
4579 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4580 p[i] = '\0';
4581 if (i < 0)
4582 return NEIGHBOR_WRAPPED;
4583 ++((unsigned char*)p)[i];
4584 l = rb_enc_precise_mbclen(p, p+len, enc);
4585 if (MBCLEN_CHARFOUND_P(l)) {
4586 l = MBCLEN_CHARFOUND_LEN(l);
4587 if (l == len) {
4588 return NEIGHBOR_FOUND;
4589 }
4590 else {
4591 memset(p+l, 0xff, len-l);
4592 }
4593 }
4594 if (MBCLEN_INVALID_P(l) && i < len-1) {
4595 long len2;
4596 int l2;
4597 for (len2 = len-1; 0 < len2; len2--) {
4598 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4599 if (!MBCLEN_INVALID_P(l2))
4600 break;
4601 }
4602 memset(p+len2+1, 0xff, len-(len2+1));
4603 }
4604 }
4605}
4606
4607static enum neighbor_char
4608enc_pred_char(char *p, long len, rb_encoding *enc)
4609{
4610 long i;
4611 int l;
4612 if (rb_enc_mbminlen(enc) > 1) {
4613 /* wchar, trivial case */
4614 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4615 if (!MBCLEN_CHARFOUND_P(r)) {
4616 return NEIGHBOR_NOT_CHAR;
4617 }
4618 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4619 if (!c) return NEIGHBOR_NOT_CHAR;
4620 --c;
4621 l = rb_enc_code_to_mbclen(c, enc);
4622 if (!l) return NEIGHBOR_NOT_CHAR;
4623 if (l != len) return NEIGHBOR_WRAPPED;
4624 rb_enc_mbcput(c, p, enc);
4625 r = rb_enc_precise_mbclen(p, p + len, enc);
4626 if (!MBCLEN_CHARFOUND_P(r)) {
4627 return NEIGHBOR_NOT_CHAR;
4628 }
4629 return NEIGHBOR_FOUND;
4630 }
4631 while (1) {
4632 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4633 p[i] = '\xff';
4634 if (i < 0)
4635 return NEIGHBOR_WRAPPED;
4636 --((unsigned char*)p)[i];
4637 l = rb_enc_precise_mbclen(p, p+len, enc);
4638 if (MBCLEN_CHARFOUND_P(l)) {
4639 l = MBCLEN_CHARFOUND_LEN(l);
4640 if (l == len) {
4641 return NEIGHBOR_FOUND;
4642 }
4643 else {
4644 memset(p+l, 0, len-l);
4645 }
4646 }
4647 if (MBCLEN_INVALID_P(l) && i < len-1) {
4648 long len2;
4649 int l2;
4650 for (len2 = len-1; 0 < len2; len2--) {
4651 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4652 if (!MBCLEN_INVALID_P(l2))
4653 break;
4654 }
4655 memset(p+len2+1, 0, len-(len2+1));
4656 }
4657 }
4658}
4659
4660/*
4661 overwrite +p+ by succeeding letter in +enc+ and returns
4662 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4663 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4664 assuming each ranges are successive, and mbclen
4665 never change in each ranges.
4666 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4667 character.
4668 */
4669static enum neighbor_char
4670enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4671{
4672 enum neighbor_char ret;
4673 unsigned int c;
4674 int ctype;
4675 int range;
4676 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4677
4678 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4679 int try;
4680 const int max_gaps = 1;
4681
4682 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4683 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4684 ctype = ONIGENC_CTYPE_DIGIT;
4685 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4686 ctype = ONIGENC_CTYPE_ALPHA;
4687 else
4688 return NEIGHBOR_NOT_CHAR;
4689
4690 MEMCPY(save, p, char, len);
4691 for (try = 0; try <= max_gaps; ++try) {
4692 ret = enc_succ_char(p, len, enc);
4693 if (ret == NEIGHBOR_FOUND) {
4694 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4695 if (rb_enc_isctype(c, ctype, enc))
4696 return NEIGHBOR_FOUND;
4697 }
4698 }
4699 MEMCPY(p, save, char, len);
4700 range = 1;
4701 while (1) {
4702 MEMCPY(save, p, char, len);
4703 ret = enc_pred_char(p, len, enc);
4704 if (ret == NEIGHBOR_FOUND) {
4705 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4706 if (!rb_enc_isctype(c, ctype, enc)) {
4707 MEMCPY(p, save, char, len);
4708 break;
4709 }
4710 }
4711 else {
4712 MEMCPY(p, save, char, len);
4713 break;
4714 }
4715 range++;
4716 }
4717 if (range == 1) {
4718 return NEIGHBOR_NOT_CHAR;
4719 }
4720
4721 if (ctype != ONIGENC_CTYPE_DIGIT) {
4722 MEMCPY(carry, p, char, len);
4723 return NEIGHBOR_WRAPPED;
4724 }
4725
4726 MEMCPY(carry, p, char, len);
4727 enc_succ_char(carry, len, enc);
4728 return NEIGHBOR_WRAPPED;
4729}
4730
4731
4732static VALUE str_succ(VALUE str);
4733
4734/*
4735 * call-seq:
4736 * succ -> new_str
4737 *
4738 * Returns the successor to +self+. The successor is calculated by
4739 * incrementing characters.
4740 *
4741 * The first character to be incremented is the rightmost alphanumeric:
4742 * or, if no alphanumerics, the rightmost character:
4743 *
4744 * 'THX1138'.succ # => "THX1139"
4745 * '<<koala>>'.succ # => "<<koalb>>"
4746 * '***'.succ # => '**+'
4747 *
4748 * The successor to a digit is another digit, "carrying" to the next-left
4749 * character for a "rollover" from 9 to 0, and prepending another digit
4750 * if necessary:
4751 *
4752 * '00'.succ # => "01"
4753 * '09'.succ # => "10"
4754 * '99'.succ # => "100"
4755 *
4756 * The successor to a letter is another letter of the same case,
4757 * carrying to the next-left character for a rollover,
4758 * and prepending another same-case letter if necessary:
4759 *
4760 * 'aa'.succ # => "ab"
4761 * 'az'.succ # => "ba"
4762 * 'zz'.succ # => "aaa"
4763 * 'AA'.succ # => "AB"
4764 * 'AZ'.succ # => "BA"
4765 * 'ZZ'.succ # => "AAA"
4766 *
4767 * The successor to a non-alphanumeric character is the next character
4768 * in the underlying character set's collating sequence,
4769 * carrying to the next-left character for a rollover,
4770 * and prepending another character if necessary:
4771 *
4772 * s = 0.chr * 3
4773 * s # => "\x00\x00\x00"
4774 * s.succ # => "\x00\x00\x01"
4775 * s = 255.chr * 3
4776 * s # => "\xFF\xFF\xFF"
4777 * s.succ # => "\x01\x00\x00\x00"
4778 *
4779 * Carrying can occur between and among mixtures of alphanumeric characters:
4780 *
4781 * s = 'zz99zz99'
4782 * s.succ # => "aaa00aa00"
4783 * s = '99zz99zz'
4784 * s.succ # => "100aa00aa"
4785 *
4786 * The successor to an empty \String is a new empty \String:
4787 *
4788 * ''.succ # => ""
4789 *
4790 * String#next is an alias for String#succ.
4791 */
4792
4793VALUE
4795{
4796 VALUE str;
4797 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4798 rb_enc_cr_str_copy_for_substr(str, orig);
4799 return str_succ(str);
4800}
4801
4802static VALUE
4803str_succ(VALUE str)
4804{
4805 rb_encoding *enc;
4806 char *sbeg, *s, *e, *last_alnum = 0;
4807 int found_alnum = 0;
4808 long l, slen;
4809 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4810 long carry_pos = 0, carry_len = 1;
4811 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4812
4813 slen = RSTRING_LEN(str);
4814 if (slen == 0) return str;
4815
4816 enc = STR_ENC_GET(str);
4817 sbeg = RSTRING_PTR(str);
4818 s = e = sbeg + slen;
4819
4820 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4821 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4822 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4823 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4824 break;
4825 }
4826 }
4827 l = rb_enc_precise_mbclen(s, e, enc);
4828 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4829 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4830 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4831 switch (neighbor) {
4832 case NEIGHBOR_NOT_CHAR:
4833 continue;
4834 case NEIGHBOR_FOUND:
4835 return str;
4836 case NEIGHBOR_WRAPPED:
4837 last_alnum = s;
4838 break;
4839 }
4840 found_alnum = 1;
4841 carry_pos = s - sbeg;
4842 carry_len = l;
4843 }
4844 if (!found_alnum) { /* str contains no alnum */
4845 s = e;
4846 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4847 enum neighbor_char neighbor;
4848 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4849 l = rb_enc_precise_mbclen(s, e, enc);
4850 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4851 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4852 MEMCPY(tmp, s, char, l);
4853 neighbor = enc_succ_char(tmp, l, enc);
4854 switch (neighbor) {
4855 case NEIGHBOR_FOUND:
4856 MEMCPY(s, tmp, char, l);
4857 return str;
4858 break;
4859 case NEIGHBOR_WRAPPED:
4860 MEMCPY(s, tmp, char, l);
4861 break;
4862 case NEIGHBOR_NOT_CHAR:
4863 break;
4864 }
4865 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4866 /* wrapped to \0...\0. search next valid char. */
4867 enc_succ_char(s, l, enc);
4868 }
4869 if (!rb_enc_asciicompat(enc)) {
4870 MEMCPY(carry, s, char, l);
4871 carry_len = l;
4872 }
4873 carry_pos = s - sbeg;
4874 }
4876 }
4877 RESIZE_CAPA(str, slen + carry_len);
4878 sbeg = RSTRING_PTR(str);
4879 s = sbeg + carry_pos;
4880 memmove(s + carry_len, s, slen - carry_pos);
4881 memmove(s, carry, carry_len);
4882 slen += carry_len;
4883 STR_SET_LEN(str, slen);
4884 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4886 return str;
4887}
4888
4889
4890/*
4891 * call-seq:
4892 * succ! -> self
4893 *
4894 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4895 *
4896 * String#next! is an alias for String#succ!.
4897 */
4898
4899static VALUE
4900rb_str_succ_bang(VALUE str)
4901{
4902 rb_str_modify(str);
4903 str_succ(str);
4904 return str;
4905}
4906
/* Returns 1 if each of the +len+ bytes at +s+ satisfies ISDIGIT(), else 0. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4916
4917static int
4918str_upto_i(VALUE str, VALUE arg)
4919{
4920 rb_yield(str);
4921 return 0;
4922}
4923
4924/*
4925 * call-seq:
4926 * upto(other_string, exclusive = false) {|string| ... } -> self
4927 * upto(other_string, exclusive = false) -> new_enumerator
4928 *
4929 * With a block given, calls the block with each \String value
4930 * returned by successive calls to String#succ;
4931 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4932 * the sequence terminates when value +other_string+ is reached;
4933 * returns +self+:
4934 *
4935 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4936 * Output:
4937 *
4938 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4939 *
4940 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4941 *
4942 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4943 *
4944 * Output:
4945 *
4946 * a8 a9 b0 b1 b2 b3 b4 b5
4947 *
4948 * If +other_string+ would not be reached, does not call the block:
4949 *
4950 * '25'.upto('5') {|s| fail s }
4951 * 'aa'.upto('a') {|s| fail s }
4952 *
4953 * With no block given, returns a new \Enumerator:
4954 *
4955 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4956 *
4957 */
4958
4959static VALUE
4960rb_str_upto(int argc, VALUE *argv, VALUE beg)
4961{
4962 VALUE end, exclusive;
4963
4964 rb_scan_args(argc, argv, "11", &end, &exclusive);
4965 RETURN_ENUMERATOR(beg, argc, argv);
4966 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4967}
4968
4969VALUE
4970rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4971{
4972 VALUE current, after_end;
4973 ID succ;
4974 int n, ascii;
4975 rb_encoding *enc;
4976
4977 CONST_ID(succ, "succ");
4978 StringValue(end);
4979 enc = rb_enc_check(beg, end);
4980 ascii = (is_ascii_string(beg) && is_ascii_string(end));
4981 /* single character */
4982 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4983 char c = RSTRING_PTR(beg)[0];
4984 char e = RSTRING_PTR(end)[0];
4985
4986 if (c > e || (excl && c == e)) return beg;
4987 for (;;) {
4988 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4989 if (!excl && c == e) break;
4990 c++;
4991 if (excl && c == e) break;
4992 }
4993 return beg;
4994 }
4995 /* both edges are all digits */
4996 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4997 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4998 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4999 VALUE b, e;
5000 int width;
5001
5002 width = RSTRING_LENINT(beg);
5003 b = rb_str_to_inum(beg, 10, FALSE);
5004 e = rb_str_to_inum(end, 10, FALSE);
5005 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5006 long bi = FIX2LONG(b);
5007 long ei = FIX2LONG(e);
5008 rb_encoding *usascii = rb_usascii_encoding();
5009
5010 while (bi <= ei) {
5011 if (excl && bi == ei) break;
5012 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5013 bi++;
5014 }
5015 }
5016 else {
5017 ID op = excl ? '<' : idLE;
5018 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5019
5020 args[0] = INT2FIX(width);
5021 while (rb_funcall(b, op, 1, e)) {
5022 args[1] = b;
5023 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5024 b = rb_funcallv(b, succ, 0, 0);
5025 }
5026 }
5027 return beg;
5028 }
5029 /* normal case */
5030 n = rb_str_cmp(beg, end);
5031 if (n > 0 || (excl && n == 0)) return beg;
5032
5033 after_end = rb_funcallv(end, succ, 0, 0);
5034 current = str_duplicate(rb_cString, beg);
5035 while (!rb_str_equal(current, after_end)) {
5036 VALUE next = Qnil;
5037 if (excl || !rb_str_equal(current, end))
5038 next = rb_funcallv(current, succ, 0, 0);
5039 if ((*each)(current, arg)) break;
5040 if (NIL_P(next)) break;
5041 current = next;
5042 StringValue(current);
5043 if (excl && rb_str_equal(current, end)) break;
5044 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5045 break;
5046 }
5047
5048 return beg;
5049}
5050
5051VALUE
5052rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5053{
5054 VALUE current;
5055 ID succ;
5056
5057 CONST_ID(succ, "succ");
5058 /* both edges are all digits */
5059 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5060 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5061 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5062 int width = RSTRING_LENINT(beg);
5063 b = rb_str_to_inum(beg, 10, FALSE);
5064 if (FIXNUM_P(b)) {
5065 long bi = FIX2LONG(b);
5066 rb_encoding *usascii = rb_usascii_encoding();
5067
5068 while (FIXABLE(bi)) {
5069 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5070 bi++;
5071 }
5072 b = LONG2NUM(bi);
5073 }
5074 args[0] = INT2FIX(width);
5075 while (1) {
5076 args[1] = b;
5077 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5078 b = rb_funcallv(b, succ, 0, 0);
5079 }
5080 }
5081 /* normal case */
5082 current = str_duplicate(rb_cString, beg);
5083 while (1) {
5084 VALUE next = rb_funcallv(current, succ, 0, 0);
5085 if ((*each)(current, arg)) break;
5086 current = next;
5087 StringValue(current);
5088 if (RSTRING_LEN(current) == 0)
5089 break;
5090 }
5091
5092 return beg;
5093}
5094
5095static int
5096include_range_i(VALUE str, VALUE arg)
5097{
5098 VALUE *argp = (VALUE *)arg;
5099 if (!rb_equal(str, *argp)) return 0;
5100 *argp = Qnil;
5101 return 1;
5102}
5103
5104VALUE
5105rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5106{
5107 beg = rb_str_new_frozen(beg);
5108 StringValue(end);
5109 end = rb_str_new_frozen(end);
5110 if (NIL_P(val)) return Qfalse;
5111 val = rb_check_string_type(val);
5112 if (NIL_P(val)) return Qfalse;
5113 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5114 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5115 rb_enc_asciicompat(STR_ENC_GET(val))) {
5116 const char *bp = RSTRING_PTR(beg);
5117 const char *ep = RSTRING_PTR(end);
5118 const char *vp = RSTRING_PTR(val);
5119 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5120 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5121 return Qfalse;
5122 else {
5123 char b = *bp;
5124 char e = *ep;
5125 char v = *vp;
5126
5127 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5128 if (b <= v && v < e) return Qtrue;
5129 return RBOOL(!RTEST(exclusive) && v == e);
5130 }
5131 }
5132 }
5133#if 0
5134 /* both edges are all digits */
5135 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5136 all_digits_p(bp, RSTRING_LEN(beg)) &&
5137 all_digits_p(ep, RSTRING_LEN(end))) {
5138 /* TODO */
5139 }
5140#endif
5141 }
5142 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5143
5144 return RBOOL(NIL_P(val));
5145}
5146
5147static VALUE
5148rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5149{
5150 if (rb_reg_search(re, str, 0, 0) >= 0) {
5151 VALUE match = rb_backref_get();
5152 int nth = rb_reg_backref_number(match, backref);
5153 return rb_reg_nth_match(nth, match);
5154 }
5155 return Qnil;
5156}
5157
5158static VALUE
5159rb_str_aref(VALUE str, VALUE indx)
5160{
5161 long idx;
5162
5163 if (FIXNUM_P(indx)) {
5164 idx = FIX2LONG(indx);
5165 }
5166 else if (RB_TYPE_P(indx, T_REGEXP)) {
5167 return rb_str_subpat(str, indx, INT2FIX(0));
5168 }
5169 else if (RB_TYPE_P(indx, T_STRING)) {
5170 if (rb_str_index(str, indx, 0) != -1)
5171 return str_duplicate(rb_cString, indx);
5172 return Qnil;
5173 }
5174 else {
5175 /* check if indx is Range */
5176 long beg, len = str_strlen(str, NULL);
5177 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5178 case Qfalse:
5179 break;
5180 case Qnil:
5181 return Qnil;
5182 default:
5183 return rb_str_substr(str, beg, len);
5184 }
5185 idx = NUM2LONG(indx);
5186 }
5187
5188 return str_substr(str, idx, 1, FALSE);
5189}
5190
5191
5192/*
5193 * call-seq:
5194 * string[index] -> new_string or nil
5195 * string[start, length] -> new_string or nil
5196 * string[range] -> new_string or nil
5197 * string[regexp, capture = 0] -> new_string or nil
5198 * string[substring] -> new_string or nil
5199 *
5200 * Returns the substring of +self+ specified by the arguments.
5201 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5202 *
5203 *
5204 */
5205
5206static VALUE
5207rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5208{
5209 if (argc == 2) {
5210 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5211 return rb_str_subpat(str, argv[0], argv[1]);
5212 }
5213 else {
5214 long beg = NUM2LONG(argv[0]);
5215 long len = NUM2LONG(argv[1]);
5216 return rb_str_substr(str, beg, len);
5217 }
5218 }
5219 rb_check_arity(argc, 1, 2);
5220 return rb_str_aref(str, argv[0]);
5221}
5222
5223VALUE
5225{
5226 char *ptr = RSTRING_PTR(str);
5227 long olen = RSTRING_LEN(str), nlen;
5228
5229 str_modifiable(str);
5230 if (len > olen) len = olen;
5231 nlen = olen - len;
5232 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5233 char *oldptr = ptr;
5234 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5235 STR_SET_EMBED(str);
5236 STR_SET_EMBED_LEN(str, nlen);
5237 ptr = RSTRING(str)->as.embed.ary;
5238 memmove(ptr, oldptr + len, nlen);
5239 if (fl == STR_NOEMBED) xfree(oldptr);
5240 }
5241 else {
5242 if (!STR_SHARED_P(str)) {
5243 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5244 rb_enc_cr_str_exact_copy(shared, str);
5245 OBJ_FREEZE(shared);
5246 }
5247 ptr = RSTRING(str)->as.heap.ptr += len;
5248 RSTRING(str)->as.heap.len = nlen;
5249 }
5250 ptr[nlen] = 0;
5252 return str;
5253}
5254
5255static void
5256rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5257{
5258 char *sptr;
5259 long slen, vlen = RSTRING_LEN(val);
5260 int cr;
5261
5262 if (beg == 0 && vlen == 0) {
5263 rb_str_drop_bytes(str, len);
5264 return;
5265 }
5266
5267 str_modify_keep_cr(str);
5268 RSTRING_GETMEM(str, sptr, slen);
5269 if (len < vlen) {
5270 /* expand string */
5271 RESIZE_CAPA(str, slen + vlen - len);
5272 sptr = RSTRING_PTR(str);
5273 }
5274
5276 cr = rb_enc_str_coderange(val);
5277 else
5279
5280 if (vlen != len) {
5281 memmove(sptr + beg + vlen,
5282 sptr + beg + len,
5283 slen - (beg + len));
5284 }
5285 if (vlen < beg && len < 0) {
5286 MEMZERO(sptr + slen, char, -len);
5287 }
5288 if (vlen > 0) {
5289 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5290 }
5291 slen += vlen - len;
5292 STR_SET_LEN(str, slen);
5293 TERM_FILL(&sptr[slen], TERM_LEN(str));
5294 ENC_CODERANGE_SET(str, cr);
5295}
5296
5297void
5298rb_str_update(VALUE str, long beg, long len, VALUE val)
5299{
5300 long slen;
5301 char *p, *e;
5302 rb_encoding *enc;
5303 int singlebyte = single_byte_optimizable(str);
5304 int cr;
5305
5306 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5307
5308 StringValue(val);
5309 enc = rb_enc_check(str, val);
5310 slen = str_strlen(str, enc); /* rb_enc_check */
5311
5312 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5313 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5314 }
5315 if (beg < 0) {
5316 beg += slen;
5317 }
5318 assert(beg >= 0);
5319 assert(beg <= slen);
5320 if (len > slen - beg) {
5321 len = slen - beg;
5322 }
5323 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5324 if (!p) p = RSTRING_END(str);
5325 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5326 if (!e) e = RSTRING_END(str);
5327 /* error check */
5328 beg = p - RSTRING_PTR(str); /* physical position */
5329 len = e - p; /* physical length */
5330 rb_str_splice_0(str, beg, len, val);
5331 rb_enc_associate(str, enc);
5333 if (cr != ENC_CODERANGE_BROKEN)
5334 ENC_CODERANGE_SET(str, cr);
5335}
5336
/* file-local alias: rb_str_splice is rb_str_update under another name */
#define rb_str_splice(str, beg, len, val) rb_str_update((str), (beg), (len), (val))
5338
5339static void
5340rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5341{
5342 int nth;
5343 VALUE match;
5344 long start, end, len;
5345 rb_encoding *enc;
5346 struct re_registers *regs;
5347
5348 if (rb_reg_search(re, str, 0, 0) < 0) {
5349 rb_raise(rb_eIndexError, "regexp not matched");
5350 }
5351 match = rb_backref_get();
5352 nth = rb_reg_backref_number(match, backref);
5353 regs = RMATCH_REGS(match);
5354 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5355 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5356 }
5357 if (nth < 0) {
5358 nth += regs->num_regs;
5359 }
5360
5361 start = BEG(nth);
5362 if (start == -1) {
5363 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5364 }
5365 end = END(nth);
5366 len = end - start;
5367 StringValue(val);
5368 enc = rb_enc_check_str(str, val);
5369 rb_str_splice_0(str, start, len, val);
5370 rb_enc_associate(str, enc);
5371}
5372
5373static VALUE
5374rb_str_aset(VALUE str, VALUE indx, VALUE val)
5375{
5376 long idx, beg;
5377
5378 switch (TYPE(indx)) {
5379 case T_REGEXP:
5380 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5381 return val;
5382
5383 case T_STRING:
5384 beg = rb_str_index(str, indx, 0);
5385 if (beg < 0) {
5386 rb_raise(rb_eIndexError, "string not matched");
5387 }
5388 beg = rb_str_sublen(str, beg);
5389 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5390 return val;
5391
5392 default:
5393 /* check if indx is Range */
5394 {
5395 long beg, len;
5396 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5397 rb_str_splice(str, beg, len, val);
5398 return val;
5399 }
5400 }
5401 /* FALLTHROUGH */
5402
5403 case T_FIXNUM:
5404 idx = NUM2LONG(indx);
5405 rb_str_splice(str, idx, 1, val);
5406 return val;
5407 }
5408}
5409
5410/*
5411 * call-seq:
5412 * string[index] = new_string
5413 * string[start, length] = new_string
5414 * string[range] = new_string
5415 * string[regexp, capture = 0] = new_string
5416 * string[substring] = new_string
5417 *
5418 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5419 * See {String Slices}[rdoc-ref:String@String+Slices].
5420 *
5421 * A few examples:
5422 *
5423 * s = 'foo'
5424 * s[2] = 'rtune' # => "rtune"
5425 * s # => "fortune"
5426 * s[1, 5] = 'init' # => "init"
5427 * s # => "finite"
5428 * s[3..4] = 'al' # => "al"
5429 * s # => "finale"
5430 * s[/e$/] = 'ly' # => "ly"
5431 * s # => "finally"
5432 * s['lly'] = 'ncial' # => "ncial"
5433 * s # => "financial"
5434 *
5435 * String#slice is an alias for String#[].
5436 *
5437 */
5438
5439static VALUE
5440rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5441{
5442 if (argc == 3) {
5443 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5444 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5445 }
5446 else {
5447 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5448 }
5449 return argv[2];
5450 }
5451 rb_check_arity(argc, 2, 3);
5452 return rb_str_aset(str, argv[0], argv[1]);
5453}
5454
5455/*
5456 * call-seq:
5457 * insert(index, other_string) -> self
5458 *
5459 * Inserts the given +other_string+ into +self+; returns +self+.
5460 *
5461 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5462 *
5463 * 'foo'.insert(1, 'bar') # => "fbaroo"
5464 *
5465 * If the \Integer +index+ is negative, counts backward from the end of +self+
5466 * and inserts +other_string+ at offset <tt>index+1</tt>
5467 * (that is, _after_ <tt>self[index]</tt>):
5468 *
5469 * 'foo'.insert(-2, 'bar') # => "fobaro"
5470 *
5471 */
5472
5473static VALUE
5474rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5475{
5476 long pos = NUM2LONG(idx);
5477
5478 if (pos == -1) {
5479 return rb_str_append(str, str2);
5480 }
5481 else if (pos < 0) {
5482 pos++;
5483 }
5484 rb_str_splice(str, pos, 0, str2);
5485 return str;
5486}
5487
5488
5489/*
5490 * call-seq:
5491 * slice!(index) -> new_string or nil
5492 * slice!(start, length) -> new_string or nil
5493 * slice!(range) -> new_string or nil
5494 * slice!(regexp, capture = 0) -> new_string or nil
5495 * slice!(substring) -> new_string or nil
5496 *
5497 * Removes and returns the substring of +self+ specified by the arguments.
5498 * See {String Slices}[rdoc-ref:String@String+Slices].
5499 *
5500 * A few examples:
5501 *
5502 * string = "This is a string"
5503 * string.slice!(2) #=> "i"
5504 * string.slice!(3..6) #=> " is "
5505 * string.slice!(/s.*t/) #=> "sa st"
5506 * string.slice!("r") #=> "r"
5507 * string #=> "Thing"
5508 *
5509 */
5510
static VALUE
rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
{
    /*
     * Overall flow: each argument form computes a byte offset (beg) and a
     * byte length (len) for the part to remove, then jumps to one of the
     * labels below:
     *   num_index - beg/len are still character based and need converting;
     *   subseq    - beg/len are byte based, result must still be copied;
     *   squash    - result is already set, only the removal is left.
     */
    VALUE result = Qnil;
    VALUE indx;
    long beg, len = 1;
    char *p;

    rb_check_arity(argc, 1, 2);
    str_modify_keep_cr(str);
    indx = argv[0];
    if (RB_TYPE_P(indx, T_REGEXP)) {
        if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
        VALUE match = rb_backref_get();
        struct re_registers *regs = RMATCH_REGS(match);
        int nth = 0;
        /* a negative capture index counts from the last group */
        if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
            if ((nth += regs->num_regs) <= 0) return Qnil;
        }
        else if (nth >= regs->num_regs) return Qnil;
        /*
         * NOTE(review): rb_str_subpat_set treats BEG(nth) == -1 as "group
         * not matched"; this path does not check for that - confirm the
         * behavior for an unmatched group is intended.
         */
        beg = BEG(nth);
        len = END(nth) - beg;
        goto subseq;
    }
    else if (argc == 2) {
        /* slice!(start, length) */
        beg = NUM2LONG(indx);
        len = NUM2LONG(argv[1]);
        goto num_index;
    }
    else if (FIXNUM_P(indx)) {
        /* slice!(index): len is still 1 from its initializer */
        beg = FIX2LONG(indx);
        if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
        if (!len) return Qnil;
        beg = p - RSTRING_PTR(str);
        goto subseq;
    }
    else if (RB_TYPE_P(indx, T_STRING)) {
        /* slice!(substring): the result is a copy of the argument */
        beg = rb_str_index(str, indx, 0);
        if (beg == -1) return Qnil;
        len = RSTRING_LEN(indx);
        result = str_duplicate(rb_cString, indx);
        goto squash;
    }
    else {
        /* Range, or anything else NUM2LONG accepts */
        switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
          case Qnil:
            return Qnil;
          case Qfalse:
            /* not a Range: same handling as the Fixnum case above */
            beg = NUM2LONG(indx);
            if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
            if (!len) return Qnil;
            beg = p - RSTRING_PTR(str);
            goto subseq;
          default:
            goto num_index;
        }
    }

  num_index:
    /* turn the character based beg/len into a byte offset and byte length */
    if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    beg = p - RSTRING_PTR(str);

  subseq:
    /* copy the part that is about to be removed */
    result = rb_str_new(RSTRING_PTR(str)+beg, len);
    rb_enc_cr_str_copy_for_substr(result, str);

  squash:
    /* remove len bytes at byte offset beg from str */
    if (len > 0) {
        if (beg == 0) {
            rb_str_drop_bytes(str, len);
        }
        else {
            char *sptr = RSTRING_PTR(str);
            long slen = RSTRING_LEN(str);
            if (beg + len > slen) /* pathological check */
                len = slen - beg;
            /* close the gap by moving the tail down */
            memmove(sptr + beg,
                    sptr + beg + len,
                    slen - (beg + len));
            slen -= len;
            STR_SET_LEN(str, slen);
            TERM_FILL(&sptr[slen], TERM_LEN(str));
        }
    }
    return result;
}
5597
5598static VALUE
5599get_pat(VALUE pat)
5600{
5601 VALUE val;
5602
5603 switch (OBJ_BUILTIN_TYPE(pat)) {
5604 case T_REGEXP:
5605 return pat;
5606
5607 case T_STRING:
5608 break;
5609
5610 default:
5611 val = rb_check_string_type(pat);
5612 if (NIL_P(val)) {
5613 Check_Type(pat, T_REGEXP);
5614 }
5615 pat = val;
5616 }
5617
5618 return rb_reg_regcomp(pat);
5619}
5620
5621static VALUE
5622get_pat_quoted(VALUE pat, int check)
5623{
5624 VALUE val;
5625
5626 switch (OBJ_BUILTIN_TYPE(pat)) {
5627 case T_REGEXP:
5628 return pat;
5629
5630 case T_STRING:
5631 break;
5632
5633 default:
5634 val = rb_check_string_type(pat);
5635 if (NIL_P(val)) {
5636 Check_Type(pat, T_REGEXP);
5637 }
5638 pat = val;
5639 }
5640 if (check && is_broken_string(pat)) {
5641 rb_exc_raise(rb_reg_check_preprocess(pat));
5642 }
5643 return pat;
5644}
5645
5646static long
5647rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5648{
5649 if (BUILTIN_TYPE(pat) == T_STRING) {
5650 pos = rb_strseq_index(str, pat, pos, 1);
5651 if (set_backref_str) {
5652 if (pos >= 0) {
5653 str = rb_str_new_frozen_String(str);
5654 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5655 }
5656 else {
5658 }
5659 }
5660 return pos;
5661 }
5662 else {
5663 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5664 }
5665}
5666
5667
5668/*
5669 * call-seq:
5670 * sub!(pattern, replacement) -> self or nil
5671 * sub!(pattern) {|match| ... } -> self or nil
5672 *
5673 * Returns +self+ with only the first occurrence
5674 * (not all occurrences) of the given +pattern+ replaced.
5675 *
5676 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5677 *
5678 * Related: String#sub, String#gsub, String#gsub!.
5679 *
5680 */
5681
5682static VALUE
5683rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5684{
5685 VALUE pat, repl, hash = Qnil;
5686 int iter = 0;
5687 long plen;
5688 int min_arity = rb_block_given_p() ? 1 : 2;
5689 long beg;
5690
5691 rb_check_arity(argc, min_arity, 2);
5692 if (argc == 1) {
5693 iter = 1;
5694 }
5695 else {
5696 repl = argv[1];
5697 hash = rb_check_hash_type(argv[1]);
5698 if (NIL_P(hash)) {
5699 StringValue(repl);
5700 }
5701 }
5702
5703 pat = get_pat_quoted(argv[0], 1);
5704
5705 str_modifiable(str);
5706 beg = rb_pat_search(pat, str, 0, 1);
5707 if (beg >= 0) {
5708 rb_encoding *enc;
5709 int cr = ENC_CODERANGE(str);
5710 long beg0, end0;
5711 VALUE match, match0 = Qnil;
5712 struct re_registers *regs;
5713 char *p, *rp;
5714 long len, rlen;
5715
5716 match = rb_backref_get();
5717 regs = RMATCH_REGS(match);
5718 if (RB_TYPE_P(pat, T_STRING)) {
5719 beg0 = beg;
5720 end0 = beg0 + RSTRING_LEN(pat);
5721 match0 = pat;
5722 }
5723 else {
5724 beg0 = BEG(0);
5725 end0 = END(0);
5726 if (iter) match0 = rb_reg_nth_match(0, match);
5727 }
5728
5729 if (iter || !NIL_P(hash)) {
5730 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5731
5732 if (iter) {
5733 repl = rb_obj_as_string(rb_yield(match0));
5734 }
5735 else {
5736 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5737 repl = rb_obj_as_string(repl);
5738 }
5739 str_mod_check(str, p, len);
5740 rb_check_frozen(str);
5741 }
5742 else {
5743 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5744 }
5745
5746 enc = rb_enc_compatible(str, repl);
5747 if (!enc) {
5748 rb_encoding *str_enc = STR_ENC_GET(str);
5749 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5750 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5751 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5752 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5753 rb_enc_name(str_enc),
5754 rb_enc_name(STR_ENC_GET(repl)));
5755 }
5756 enc = STR_ENC_GET(repl);
5757 }
5758 rb_str_modify(str);
5759 rb_enc_associate(str, enc);
5761 int cr2 = ENC_CODERANGE(repl);
5762 if (cr2 == ENC_CODERANGE_BROKEN ||
5763 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5765 else
5766 cr = cr2;
5767 }
5768 plen = end0 - beg0;
5769 rlen = RSTRING_LEN(repl);
5770 len = RSTRING_LEN(str);
5771 if (rlen > plen) {
5772 RESIZE_CAPA(str, len + rlen - plen);
5773 }
5774 p = RSTRING_PTR(str);
5775 if (rlen != plen) {
5776 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5777 }
5778 rp = RSTRING_PTR(repl);
5779 memmove(p + beg0, rp, rlen);
5780 len += rlen - plen;
5781 STR_SET_LEN(str, len);
5782 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5783 ENC_CODERANGE_SET(str, cr);
5784
5785 return str;
5786 }
5787 return Qnil;
5788}
5789
5790
5791/*
5792 * call-seq:
5793 * sub(pattern, replacement) -> new_string
5794 * sub(pattern) {|match| ... } -> new_string
5795 *
5796 * Returns a copy of +self+ with only the first occurrence
5797 * (not all occurrences) of the given +pattern+ replaced.
5798 *
5799 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5800 *
5801 * Related: String#sub!, String#gsub, String#gsub!.
5802 *
5803 */
5804
5805static VALUE
5806rb_str_sub(int argc, VALUE *argv, VALUE str)
5807{
5808 str = str_duplicate(rb_cString, str);
5809 rb_str_sub_bang(argc, argv, str);
5810 return str;
5811}
5812
/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * Builds the substituted text in a fresh buffer (dest).  With a non-zero
 * +bang+ the buffer then replaces the contents of +str+ and Qnil is
 * returned when nothing matched; with a zero +bang+ the buffer (or a
 * copy of +str+ when nothing matched) is returned.
 *
 * The replacement comes from a block (ITER), a Hash lookup (MAP) or a
 * replacement string (STR), depending on the arguments.
 */
static VALUE
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
{
    VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
    struct re_registers *regs;
    long beg, beg0, end0;
    long offset, blen, slen, len, last;
    enum {STR, ITER, MAP} mode = STR;
    char *sp, *cp;
    /* -1: not yet known whether the replacement string needs the match data */
    int need_backref = -1;
    rb_encoding *str_enc;

    switch (argc) {
      case 1:
        RETURN_ENUMERATOR(str, argc, argv);
        mode = ITER;
        break;
      case 2:
        repl = argv[1];
        hash = rb_check_hash_type(argv[1]);
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        else {
            mode = MAP;
        }
        break;
      default:
        rb_error_arity(argc, 1, 2);
    }

    pat = get_pat_quoted(argv[0], 1);
    beg = rb_pat_search(pat, str, 0, need_backref);
    if (beg < 0) {
        if (bang) return Qnil; /* no match, no substitution */
        return str_duplicate(rb_cString, str);
    }

    offset = 0;
    blen = RSTRING_LEN(str) + 30; /* len + margin */
    dest = rb_str_buf_new(blen);
    /* sp/slen remember the original buffer for str_mod_check below */
    sp = RSTRING_PTR(str);
    slen = RSTRING_LEN(str);
    cp = sp;
    str_enc = STR_ENC_GET(str);
    rb_enc_associate(dest, str_enc);
    ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);

    do {
        match = rb_backref_get();
        regs = RMATCH_REGS(match);
        /* [beg0, end0) is the byte range of the current match */
        if (RB_TYPE_P(pat, T_STRING)) {
            beg0 = beg;
            end0 = beg0 + RSTRING_LEN(pat);
            match0 = pat;
        }
        else {
            beg0 = BEG(0);
            end0 = END(0);
            if (mode == ITER) match0 = rb_reg_nth_match(0, match);
        }

        if (mode) {
            /* ITER or MAP: the replacement is computed by Ruby code */
            if (mode == ITER) {
                val = rb_obj_as_string(rb_yield(match0));
            }
            else {
                val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
                val = rb_obj_as_string(val);
            }
            /* that code must not have modified str in the meantime */
            str_mod_check(str, sp, slen);
            if (val == dest) { /* paranoid check [ruby-dev:24827] */
                rb_raise(rb_eRuntimeError, "block should not cheat");
            }
        }
        else if (need_backref) {
            val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
            if (need_backref < 0) {
                /*
                 * First substitution decides: presumably rb_reg_regsub
                 * returns repl itself when it contains nothing to expand,
                 * in which case later searches skip the backref - confirm.
                 */
                need_backref = val != repl;
            }
        }
        else {
            val = repl;
        }

        len = beg0 - offset; /* copy pre-match substr */
        if (len) {
            rb_enc_str_buf_cat(dest, cp, len, str_enc);
        }

        rb_str_buf_append(dest, val);

        /* remember where this search started; used for the final search below */
        last = offset;
        offset = end0;
        if (beg0 == end0) {
            /*
             * Always consume at least one character of the input string
             * in order to prevent infinite loops.
             */
            if (RSTRING_LEN(str) <= end0) break;
            len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
            rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
            offset = end0 + len;
        }
        cp = RSTRING_PTR(str) + offset;
        if (offset > RSTRING_LEN(str)) break;
        beg = rb_pat_search(pat, str, offset, need_backref);
    } while (beg >= 0);
    /* copy whatever follows the last match */
    if (RSTRING_LEN(str) > offset) {
        rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
    }
    /*
     * Search once more from the start of the last successful round with
     * the backref enabled; presumably this leaves the match data
     * describing the last match - confirm.
     */
    rb_pat_search(pat, str, last, 1);
    if (bang) {
        str_shared_replace(str, dest);
    }
    else {
        str = dest;
    }

    return str;
}
5934
5935
5936/*
5937 * call-seq:
5938 * gsub!(pattern, replacement) -> self or nil
5939 * gsub!(pattern) {|match| ... } -> self or nil
5940 * gsub!(pattern) -> an_enumerator
5941 *
5942 * Performs the specified substring replacement(s) on +self+;
5943 * returns +self+ if any replacement occurred, +nil+ otherwise.
5944 *
5945 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5946 *
5947 * Returns an Enumerator if no +replacement+ and no block given.
5948 *
5949 * Related: String#sub, String#gsub, String#sub!.
5950 *
5951 */
5952
5953static VALUE
5954rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5955{
5956 str_modify_keep_cr(str);
5957 return str_gsub(argc, argv, str, 1);
5958}
5959
5960
5961/*
5962 * call-seq:
5963 * gsub(pattern, replacement) -> new_string
5964 * gsub(pattern) {|match| ... } -> new_string
5965 * gsub(pattern) -> enumerator
5966 *
5967 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5968 *
5969 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5970 *
5971 * Returns an Enumerator if no +replacement+ and no block given.
5972 *
5973 * Related: String#sub, String#sub!, String#gsub!.
5974 *
5975 */
5976
5977static VALUE
5978rb_str_gsub(int argc, VALUE *argv, VALUE str)
5979{
5980 return str_gsub(argc, argv, str, 0);
5981}
5982
5983
5984/*
5985 * call-seq:
5986 * replace(other_string) -> self
5987 *
5988 * Replaces the contents of +self+ with the contents of +other_string+:
5989 *
5990 * s = 'foo' # => "foo"
5991 * s.replace('bar') # => "bar"
5992 *
5993 */
5994
5995VALUE
5997{
5998 str_modifiable(str);
5999 if (str == str2) return str;
6000
6001 StringValue(str2);
6002 str_discard(str);
6003 return str_replace(str, str2);
6004}
6005
6006/*
6007 * call-seq:
6008 * clear -> self
6009 *
6010 * Removes the contents of +self+:
6011 *
6012 * s = 'foo' # => "foo"
6013 * s.clear # => ""
6014 *
6015 */
6016
6017static VALUE
6018rb_str_clear(VALUE str)
6019{
6020 str_discard(str);
6021 STR_SET_EMBED(str);
6022 STR_SET_EMBED_LEN(str, 0);
6023 RSTRING_PTR(str)[0] = 0;
6024 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6026 else
6028 return str;
6029}
6030
6031/*
6032 * call-seq:
6033 * chr -> string
6034 *
6035 * Returns a string containing the first character of +self+:
6036 *
6037 * s = 'foo' # => "foo"
6038 * s.chr # => "f"
6039 *
6040 */
6041
6042static VALUE
6043rb_str_chr(VALUE str)
6044{
6045 return rb_str_substr(str, 0, 1);
6046}
6047
6048/*
6049 * call-seq:
6050 * getbyte(index) -> integer or nil
6051 *
6052 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6053 *
6054 * s = 'abcde' # => "abcde"
6055 * s.getbyte(0) # => 97
6056 * s.getbyte(-1) # => 101
6057 * s.getbyte(5) # => nil
6058 *
6059 * Related: String#setbyte.
6060 */
6061static VALUE
6062rb_str_getbyte(VALUE str, VALUE index)
6063{
6064 long pos = NUM2LONG(index);
6065
6066 if (pos < 0)
6067 pos += RSTRING_LEN(str);
6068 if (pos < 0 || RSTRING_LEN(str) <= pos)
6069 return Qnil;
6070
6071 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6072}
6073
6074/*
6075 * call-seq:
6076 * setbyte(index, integer) -> integer
6077 *
6078 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6079 *
6080 * s = 'abcde' # => "abcde"
6081 * s.setbyte(0, 98) # => 98
6082 * s # => "bbcde"
6083 *
6084 * Related: String#getbyte.
6085 */
6086static VALUE
6087rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6088{
6089 long pos = NUM2LONG(index);
6090 long len = RSTRING_LEN(str);
6091 char *ptr, *head, *left = 0;
6092 rb_encoding *enc;
6093 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6094
6095 if (pos < -len || len <= pos)
6096 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6097 if (pos < 0)
6098 pos += len;
6099
6100 VALUE v = rb_to_int(value);
6101 VALUE w = rb_int_and(v, INT2FIX(0xff));
6102 char byte = (char)(NUM2INT(w) & 0xFF);
6103
6104 if (!str_independent(str))
6105 str_make_independent(str);
6106 enc = STR_ENC_GET(str);
6107 head = RSTRING_PTR(str);
6108 ptr = &head[pos];
6109 if (!STR_EMBED_P(str)) {
6110 cr = ENC_CODERANGE(str);
6111 switch (cr) {
6112 case ENC_CODERANGE_7BIT:
6113 left = ptr;
6114 *ptr = byte;
6115 if (ISASCII(byte)) goto end;
6116 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6117 if (!MBCLEN_CHARFOUND_P(nlen))
6119 else
6121 goto end;
6123 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6124 width = rb_enc_precise_mbclen(left, head+len, enc);
6125 *ptr = byte;
6126 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6127 if (!MBCLEN_CHARFOUND_P(nlen))
6129 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6131 goto end;
6132 }
6133 }
6135 *ptr = byte;
6136
6137 end:
6138 return value;
6139}
6140
6141static VALUE
6142str_byte_substr(VALUE str, long beg, long len, int empty)
6143{
6144 long n = RSTRING_LEN(str);
6145
6146 if (beg > n || len < 0) return Qnil;
6147 if (beg < 0) {
6148 beg += n;
6149 if (beg < 0) return Qnil;
6150 }
6151 if (len > n - beg)
6152 len = n - beg;
6153 if (len <= 0) {
6154 if (!empty) return Qnil;
6155 len = 0;
6156 }
6157
6158 VALUE str2 = str_subseq(str, beg, len);
6159
6160 str_enc_copy(str2, str);
6161
6162 if (RSTRING_LEN(str2) == 0) {
6163 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6165 else
6167 }
6168 else {
6169 switch (ENC_CODERANGE(str)) {
6170 case ENC_CODERANGE_7BIT:
6172 break;
6173 default:
6175 break;
6176 }
6177 }
6178
6179 return str2;
6180}
6181
6182static VALUE
6183str_byte_aref(VALUE str, VALUE indx)
6184{
6185 long idx;
6186 if (FIXNUM_P(indx)) {
6187 idx = FIX2LONG(indx);
6188 }
6189 else {
6190 /* check if indx is Range */
6191 long beg, len = RSTRING_LEN(str);
6192
6193 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6194 case Qfalse:
6195 break;
6196 case Qnil:
6197 return Qnil;
6198 default:
6199 return str_byte_substr(str, beg, len, TRUE);
6200 }
6201
6202 idx = NUM2LONG(indx);
6203 }
6204 return str_byte_substr(str, idx, 1, FALSE);
6205}
6206
6207/*
6208 * call-seq:
6209 * byteslice(index, length = 1) -> string or nil
6210 * byteslice(range) -> string or nil
6211 *
6212 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6213 *
6214 * With integer arguments +index+ and +length+ given,
6215 * returns the substring beginning at the given +index+
6216 * of the given +length+ (if possible),
6217 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6218 *
6219 * s = '0123456789' # => "0123456789"
6220 * s.byteslice(2) # => "2"
6221 * s.byteslice(200) # => nil
6222 * s.byteslice(4, 3) # => "456"
6223 * s.byteslice(4, 30) # => "456789"
6224 * s.byteslice(4, -1) # => nil
6225 * s.byteslice(40, 2) # => nil
6226 *
6227 * In either case above, counts backwards from the end of +self+
6228 * if +index+ is negative:
6229 *
6230 * s = '0123456789' # => "0123456789"
6231 * s.byteslice(-4) # => "6"
6232 * s.byteslice(-4, 3) # => "678"
6233 *
6234 * With Range argument +range+ given, returns
6235 * <tt>byteslice(range.begin, range.size)</tt>:
6236 *
6237 * s = '0123456789' # => "0123456789"
6238 * s.byteslice(4..6) # => "456"
6239 * s.byteslice(-6..-4) # => "456"
6240 * s.byteslice(5..2) # => "" # range.size is zero.
6241 * s.byteslice(40..42) # => nil
6242 *
6243 * In all cases, a returned string has the same encoding as +self+:
6244 *
6245 * s.encoding # => #<Encoding:UTF-8>
6246 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6247 *
6248 */
6249
6250static VALUE
6251rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6252{
6253 if (argc == 2) {
6254 long beg = NUM2LONG(argv[0]);
6255 long len = NUM2LONG(argv[1]);
6256 return str_byte_substr(str, beg, len, TRUE);
6257 }
6258 rb_check_arity(argc, 1, 2);
6259 return str_byte_aref(str, argv[0]);
6260}
6261
6262/*
6263 * call-seq:
6264 * bytesplice(index, length, str) -> string
6265 * bytesplice(range, str) -> string
6266 *
6267 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6268 * The portion of the string affected is determined using
6269 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6270 * If the replacement string is not the same length as the text it is replacing,
6271 * the string will be adjusted accordingly.
6272 * The form that takes Integer arguments will raise an IndexError if the value is out
6273 * of range; the Range form will raise a RangeError.
6274 * If the beginning or ending offset does not land on a character (codepoint)
6275 * boundary, an IndexError will be raised.
6276 */
6277
6278static VALUE
6279rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6280{
6281 long beg, end, len, slen;
6282 VALUE val;
6283 rb_encoding *enc;
6284 int cr;
6285
6286 rb_check_arity(argc, 2, 3);
6287 if (argc == 2) {
6288 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6289 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6290 rb_builtin_class_name(argv[0]));
6291 }
6292 val = argv[1];
6293 }
6294 else {
6295 beg = NUM2LONG(argv[0]);
6296 len = NUM2LONG(argv[1]);
6297 val = argv[2];
6298 }
6299 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
6300 slen = RSTRING_LEN(str);
6301 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6302 rb_raise(rb_eIndexError, "index %ld out of string", beg);
6303 }
6304 if (beg < 0) {
6305 beg += slen;
6306 }
6307 assert(beg >= 0);
6308 assert(beg <= slen);
6309 if (len > slen - beg) {
6310 len = slen - beg;
6311 }
6312 end = beg + len;
6313 if (!str_check_byte_pos(str, beg)) {
6315 "offset %ld does not land on character boundary", beg);
6316 }
6317 if (!str_check_byte_pos(str, end)) {
6319 "offset %ld does not land on character boundary", end);
6320 }
6321 StringValue(val);
6322 enc = rb_enc_check(str, val);
6323 str_modify_keep_cr(str);
6324 rb_str_splice_0(str, beg, len, val);
6325 rb_enc_associate(str, enc);
6327 if (cr != ENC_CODERANGE_BROKEN)
6328 ENC_CODERANGE_SET(str, cr);
6329 return str;
6330}
6331
6332/*
6333 * call-seq:
6334 * reverse -> string
6335 *
6336 * Returns a new string with the characters from +self+ in reverse order.
6337 *
6338 * 'stressed'.reverse # => "desserts"
6339 *
6340 */
6341
6342static VALUE
6343rb_str_reverse(VALUE str)
6344{
6345 rb_encoding *enc;
6346 VALUE rev;
6347 char *s, *e, *p;
6348 int cr;
6349
6350 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6351 enc = STR_ENC_GET(str);
6352 rev = rb_str_new(0, RSTRING_LEN(str));
6353 s = RSTRING_PTR(str); e = RSTRING_END(str);
6354 p = RSTRING_END(rev);
6355 cr = ENC_CODERANGE(str);
6356
6357 if (RSTRING_LEN(str) > 1) {
6358 if (single_byte_optimizable(str)) {
6359 while (s < e) {
6360 *--p = *s++;
6361 }
6362 }
6363 else if (cr == ENC_CODERANGE_VALID) {
6364 while (s < e) {
6365 int clen = rb_enc_fast_mbclen(s, e, enc);
6366
6367 p -= clen;
6368 memcpy(p, s, clen);
6369 s += clen;
6370 }
6371 }
6372 else {
6373 cr = rb_enc_asciicompat(enc) ?
6375 while (s < e) {
6376 int clen = rb_enc_mbclen(s, e, enc);
6377
6378 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6379 p -= clen;
6380 memcpy(p, s, clen);
6381 s += clen;
6382 }
6383 }
6384 }
6385 STR_SET_LEN(rev, RSTRING_LEN(str));
6386 str_enc_copy(rev, str);
6387 ENC_CODERANGE_SET(rev, cr);
6388
6389 return rev;
6390}
6391
6392
6393/*
6394 * call-seq:
6395 * reverse! -> self
6396 *
6397 * Returns +self+ with its characters reversed:
6398 *
6399 * s = 'stressed'
6400 * s.reverse! # => "desserts"
6401 * s # => "desserts"
6402 *
6403 */
6404
6405static VALUE
6406rb_str_reverse_bang(VALUE str)
6407{
6408 if (RSTRING_LEN(str) > 1) {
6409 if (single_byte_optimizable(str)) {
6410 char *s, *e, c;
6411
6412 str_modify_keep_cr(str);
6413 s = RSTRING_PTR(str);
6414 e = RSTRING_END(str) - 1;
6415 while (s < e) {
6416 c = *s;
6417 *s++ = *e;
6418 *e-- = c;
6419 }
6420 }
6421 else {
6422 str_shared_replace(str, rb_str_reverse(str));
6423 }
6424 }
6425 else {
6426 str_modify_keep_cr(str);
6427 }
6428 return str;
6429}
6430
6431
6432/*
6433 * call-seq:
6434 * include? other_string -> true or false
6435 *
6436 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6437 *
6438 * s = 'foo'
6439 * s.include?('f') # => true
6440 * s.include?('fo') # => true
6441 * s.include?('food') # => false
6442 *
6443 */
6444
6445VALUE
6446rb_str_include(VALUE str, VALUE arg)
6447{
6448 long i;
6449
6450 StringValue(arg);
6451 i = rb_str_index(str, arg, 0);
6452
6453 return RBOOL(i != -1);
6454}
6455
6456
6457/*
6458 * call-seq:
6459 * to_i(base = 10) -> integer
6460 *
6461 * Returns the result of interpreting leading characters in +self+
6462 * as an integer in the given +base+ (which must be in (0, 2..36)):
6463 *
6464 * '123456'.to_i # => 123456
6465 * '123def'.to_i(16) # => 1195503
6466 *
6467 * With +base+ zero, string +object+ may contain leading characters
6468 * to specify the actual base:
6469 *
6470 * '123def'.to_i(0) # => 123
6471 * '0123def'.to_i(0) # => 83
6472 * '0b123def'.to_i(0) # => 1
6473 * '0o123def'.to_i(0) # => 83
6474 * '0d123def'.to_i(0) # => 123
6475 * '0x123def'.to_i(0) # => 1195503
6476 *
6477 * Characters past a leading valid number (in the given +base+) are ignored:
6478 *
6479 * '12.345'.to_i # => 12
6480 * '12345'.to_i(2) # => 1
6481 *
6482 * Returns zero if there is no leading valid number:
6483 *
6484 * 'abcdef'.to_i # => 0
6485 * '2'.to_i(2) # => 0
6486 *
6487 */
6488
6489static VALUE
6490rb_str_to_i(int argc, VALUE *argv, VALUE str)
6491{
6492 int base = 10;
6493
6494 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6495 rb_raise(rb_eArgError, "invalid radix %d", base);
6496 }
6497 return rb_str_to_inum(str, base, FALSE);
6498}
6499
6500
6501/*
6502 * call-seq:
6503 * to_f -> float
6504 *
6505 * Returns the result of interpreting leading characters in +self+ as a Float:
6506 *
6507 * '3.14159'.to_f # => 3.14159
6508 * '1.234e-2'.to_f # => 0.01234
6509 *
6510 * Characters past a leading valid number are ignored:
6511 *
6512 * '3.14 (pi to two places)'.to_f # => 3.14
6513 *
6514 * Returns zero if there is no leading valid number:
6515 *
6516 * 'abcdef'.to_f # => 0.0
6517 *
6518 */
6519
6520static VALUE
6521rb_str_to_f(VALUE str)
6522{
6523 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6524}
6525
6526
6527/*
6528 * call-seq:
6529 * to_s -> self or string
6530 *
6531 * Returns +self+ if +self+ is a \String,
6532 * or +self+ converted to a \String if +self+ is a subclass of \String.
6533 *
6534 * String#to_str is an alias for String#to_s.
6535 *
6536 */
6537
6538static VALUE
6539rb_str_to_s(VALUE str)
6540{
6541 if (rb_obj_class(str) != rb_cString) {
6542 return str_duplicate(rb_cString, str);
6543 }
6544 return str;
6545}
6546
/* Compiled out (#if 0): appends the single character with codepoint c,
 * encoded in enc, to str. */
6547#if 0
6548static void
6549str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
6550{
6551 char s[RUBY_MAX_CHAR_LEN];
/* number of bytes c occupies in enc */
6552 int n = rb_enc_codelen(c, enc);
6553
/* encode c into the scratch buffer, then append those n bytes */
6554 rb_enc_mbcput(c, s, enc);
6555 rb_enc_str_buf_cat(str, s, n, enc);
6556}
6557#endif
6558
6559#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6560
6561int
6562rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6563{
6564 char buf[CHAR_ESC_LEN + 1];
6565 int l;
6566
6567#if SIZEOF_INT > 4
6568 c &= 0xffffffff;
6569#endif
6570 if (unicode_p) {
6571 if (c < 0x7F && ISPRINT(c)) {
6572 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6573 }
6574 else if (c < 0x10000) {
6575 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6576 }
6577 else {
6578 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6579 }
6580 }
6581 else {
6582 if (c < 0x100) {
6583 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6584 }
6585 else {
6586 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6587 }
6588 }
6589 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6590 rb_str_buf_cat(result, buf, l);
6591 return l;
6592}
6593
/*
 * Maps a control character to its backslash escape sequence as a static
 * string (for example '\n' yields the two characters backslash and 'n').
 * Returns NULL for any character that has no such short escape.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *escape;
    } escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
        if (escapes[i].code == c) return escapes[i].escape;
    }
    return NULL;
}
6611
6612VALUE
6613rb_str_escape(VALUE str)
6614{
6615 int encidx = ENCODING_GET(str);
6616 rb_encoding *enc = rb_enc_from_index(encidx);
6617 const char *p = RSTRING_PTR(str);
6618 const char *pend = RSTRING_END(str);
6619 const char *prev = p;
6620 char buf[CHAR_ESC_LEN + 1];
6621 VALUE result = rb_str_buf_new(0);
6622 int unicode_p = rb_enc_unicode_p(enc);
6623 int asciicompat = rb_enc_asciicompat(enc);
6624
6625 while (p < pend) {
6626 unsigned int c;
6627 const char *cc;
6628 int n = rb_enc_precise_mbclen(p, pend, enc);
6629 if (!MBCLEN_CHARFOUND_P(n)) {
6630 if (p > prev) str_buf_cat(result, prev, p - prev);
6631 n = rb_enc_mbminlen(enc);
6632 if (pend < p + n)
6633 n = (int)(pend - p);
6634 while (n--) {
6635 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6636 str_buf_cat(result, buf, strlen(buf));
6637 prev = ++p;
6638 }
6639 continue;
6640 }
6641 n = MBCLEN_CHARFOUND_LEN(n);
6642 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6643 p += n;
6644 cc = ruby_escaped_char(c);
6645 if (cc) {
6646 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6647 str_buf_cat(result, cc, strlen(cc));
6648 prev = p;
6649 }
6650 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6651 }
6652 else {
6653 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6654 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6655 prev = p;
6656 }
6657 }
6658 if (p > prev) str_buf_cat(result, prev, p - prev);
6659 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6660
6661 return result;
6662}
6663
6664/*
6665 * call-seq:
6666 * inspect -> string
6667 *
6668 * Returns a printable version of +self+, enclosed in double-quotes,
6669 * and with special characters escaped:
6670 *
6671 * s = "foo\tbar\tbaz\n"
6672 * s.inspect
6673 * # => "\"foo\\tbar\\tbaz\\n\""
6674 *
6675 */
6676
6677VALUE
6679{
6680 int encidx = ENCODING_GET(str);
6681 rb_encoding *enc = rb_enc_from_index(encidx);
6682 const char *p, *pend, *prev;
6683 char buf[CHAR_ESC_LEN + 1];
6684 VALUE result = rb_str_buf_new(0);
6685 rb_encoding *resenc = rb_default_internal_encoding();
6686 int unicode_p = rb_enc_unicode_p(enc);
6687 int asciicompat = rb_enc_asciicompat(enc);
6688
6689 if (resenc == NULL) resenc = rb_default_external_encoding();
6690 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6691 rb_enc_associate(result, resenc);
6692 str_buf_cat2(result, "\"");
6693
6694 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6695 prev = p;
6696 while (p < pend) {
6697 unsigned int c, cc;
6698 int n;
6699
6700 n = rb_enc_precise_mbclen(p, pend, enc);
6701 if (!MBCLEN_CHARFOUND_P(n)) {
6702 if (p > prev) str_buf_cat(result, prev, p - prev);
6703 n = rb_enc_mbminlen(enc);
6704 if (pend < p + n)
6705 n = (int)(pend - p);
6706 while (n--) {
6707 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6708 str_buf_cat(result, buf, strlen(buf));
6709 prev = ++p;
6710 }
6711 continue;
6712 }
6713 n = MBCLEN_CHARFOUND_LEN(n);
6714 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6715 p += n;
6716 if ((asciicompat || unicode_p) &&
6717 (c == '"'|| c == '\\' ||
6718 (c == '#' &&
6719 p < pend &&
6720 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6721 (cc = rb_enc_codepoint(p,pend,enc),
6722 (cc == '$' || cc == '@' || cc == '{'))))) {
6723 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6724 str_buf_cat2(result, "\\");
6725 if (asciicompat || enc == resenc) {
6726 prev = p - n;
6727 continue;
6728 }
6729 }
6730 switch (c) {
6731 case '\n': cc = 'n'; break;
6732 case '\r': cc = 'r'; break;
6733 case '\t': cc = 't'; break;
6734 case '\f': cc = 'f'; break;
6735 case '\013': cc = 'v'; break;
6736 case '\010': cc = 'b'; break;
6737 case '\007': cc = 'a'; break;
6738 case 033: cc = 'e'; break;
6739 default: cc = 0; break;
6740 }
6741 if (cc) {
6742 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6743 buf[0] = '\\';
6744 buf[1] = (char)cc;
6745 str_buf_cat(result, buf, 2);
6746 prev = p;
6747 continue;
6748 }
6749 /* The special casing of 0x85 (NEXT_LINE) here is because
6750 * Oniguruma historically treats it as printable, but it
6751 * doesn't match the print POSIX bracket class or character
6752 * property in regexps.
6753 *
6754 * See Ruby Bug #16842 for details:
6755 * https://bugs.ruby-lang.org/issues/16842
6756 */
6757 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6758 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6759 continue;
6760 }
6761 else {
6762 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6763 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6764 prev = p;
6765 continue;
6766 }
6767 }
6768 if (p > prev) str_buf_cat(result, prev, p - prev);
6769 str_buf_cat2(result, "\"");
6770
6771 return result;
6772}
6773
/* True when p is still before e and the byte at p is '$', '@' or '{'. */
6774#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6775
6776/*
6777 * call-seq:
6778 * dump -> string
6779 *
6780 * Returns a printable version of +self+, enclosed in double-quotes,
6781 * with special characters escaped, and with non-printing characters
6782 * replaced by hexadecimal notation:
6783 *
6784 * "hello \n ''".dump # => "\"hello \\n ''\""
6785 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6786 *
6787 * Related: String#undump (inverse of String#dump).
6788 *
6789 */
6790
6791VALUE
6793{
6794 int encidx = rb_enc_get_index(str);
6795 rb_encoding *enc = rb_enc_from_index(encidx);
6796 long len;
6797 const char *p, *pend;
6798 char *q, *qend;
6799 VALUE result;
6800 int u8 = (encidx == rb_utf8_encindex());
6801 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6802
6803 len = 2; /* "" */
6804 if (!rb_enc_asciicompat(enc)) {
6805 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6806 len += strlen(enc->name);
6807 }
6808
6809 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6810 while (p < pend) {
6811 int clen;
6812 unsigned char c = *p++;
6813
6814 switch (c) {
6815 case '"': case '\\':
6816 case '\n': case '\r':
6817 case '\t': case '\f':
6818 case '\013': case '\010': case '\007': case '\033':
6819 clen = 2;
6820 break;
6821
6822 case '#':
6823 clen = IS_EVSTR(p, pend) ? 2 : 1;
6824 break;
6825
6826 default:
6827 if (ISPRINT(c)) {
6828 clen = 1;
6829 }
6830 else {
6831 if (u8 && c > 0x7F) { /* \u notation */
6832 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6833 if (MBCLEN_CHARFOUND_P(n)) {
6834 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6835 if (cc <= 0xFFFF)
6836 clen = 6; /* \uXXXX */
6837 else if (cc <= 0xFFFFF)
6838 clen = 9; /* \u{XXXXX} */
6839 else
6840 clen = 10; /* \u{XXXXXX} */
6841 p += MBCLEN_CHARFOUND_LEN(n)-1;
6842 break;
6843 }
6844 }
6845 clen = 4; /* \xNN */
6846 }
6847 break;
6848 }
6849
6850 if (clen > LONG_MAX - len) {
6851 rb_raise(rb_eRuntimeError, "string size too big");
6852 }
6853 len += clen;
6854 }
6855
6856 result = rb_str_new(0, len);
6857 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6858 q = RSTRING_PTR(result); qend = q + len + 1;
6859
6860 *q++ = '"';
6861 while (p < pend) {
6862 unsigned char c = *p++;
6863
6864 if (c == '"' || c == '\\') {
6865 *q++ = '\\';
6866 *q++ = c;
6867 }
6868 else if (c == '#') {
6869 if (IS_EVSTR(p, pend)) *q++ = '\\';
6870 *q++ = '#';
6871 }
6872 else if (c == '\n') {
6873 *q++ = '\\';
6874 *q++ = 'n';
6875 }
6876 else if (c == '\r') {
6877 *q++ = '\\';
6878 *q++ = 'r';
6879 }
6880 else if (c == '\t') {
6881 *q++ = '\\';
6882 *q++ = 't';
6883 }
6884 else if (c == '\f') {
6885 *q++ = '\\';
6886 *q++ = 'f';
6887 }
6888 else if (c == '\013') {
6889 *q++ = '\\';
6890 *q++ = 'v';
6891 }
6892 else if (c == '\010') {
6893 *q++ = '\\';
6894 *q++ = 'b';
6895 }
6896 else if (c == '\007') {
6897 *q++ = '\\';
6898 *q++ = 'a';
6899 }
6900 else if (c == '\033') {
6901 *q++ = '\\';
6902 *q++ = 'e';
6903 }
6904 else if (ISPRINT(c)) {
6905 *q++ = c;
6906 }
6907 else {
6908 *q++ = '\\';
6909 if (u8) {
6910 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6911 if (MBCLEN_CHARFOUND_P(n)) {
6912 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6913 p += n;
6914 if (cc <= 0xFFFF)
6915 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6916 else
6917 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6918 q += strlen(q);
6919 continue;
6920 }
6921 }
6922 snprintf(q, qend-q, "x%02X", c);
6923 q += 3;
6924 }
6925 }
6926 *q++ = '"';
6927 *q = '\0';
6928 if (!rb_enc_asciicompat(enc)) {
6929 snprintf(q, qend-q, nonascii_suffix, enc->name);
6930 encidx = rb_ascii8bit_encindex();
6931 }
6932 /* result from dump is ASCII */
6933 rb_enc_associate_index(result, encidx);
6935 return result;
6936}
6937
/*
 * Translates the letter that follows a backslash in a dumped string
 * ('n', 'r', 't', 'f', 'v', 'b', 'a' or 'e') into the control character it
 * denotes.  Returns -1 for any other input so that the function never
 * falls off its end without a value.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
      default:
        break;
    }
    return -1;
}
6961
6962static void
6963undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6964{
6965 const char *s = *ss;
6966 unsigned int c;
6967 int codelen;
6968 size_t hexlen;
6969 unsigned char buf[6];
6970 static rb_encoding *enc_utf8 = NULL;
6971
6972 switch (*s) {
6973 case '\\':
6974 case '"':
6975 case '#':
6976 rb_str_cat(undumped, s, 1); /* cat itself */
6977 s++;
6978 break;
6979 case 'n':
6980 case 'r':
6981 case 't':
6982 case 'f':
6983 case 'v':
6984 case 'b':
6985 case 'a':
6986 case 'e':
6987 *buf = unescape_ascii(*s);
6988 rb_str_cat(undumped, (char *)buf, 1);
6989 s++;
6990 break;
6991 case 'u':
6992 if (*binary) {
6993 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
6994 }
6995 *utf8 = true;
6996 if (++s >= s_end) {
6997 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
6998 }
6999 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7000 if (*penc != enc_utf8) {
7001 *penc = enc_utf8;
7002 rb_enc_associate(undumped, enc_utf8);
7003 }
7004 if (*s == '{') { /* handle \u{...} form */
7005 s++;
7006 for (;;) {
7007 if (s >= s_end) {
7008 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7009 }
7010 if (*s == '}') {
7011 s++;
7012 break;
7013 }
7014 if (ISSPACE(*s)) {
7015 s++;
7016 continue;
7017 }
7018 c = scan_hex(s, s_end-s, &hexlen);
7019 if (hexlen == 0 || hexlen > 6) {
7020 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7021 }
7022 if (c > 0x10ffff) {
7023 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7024 }
7025 if (0xd800 <= c && c <= 0xdfff) {
7026 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7027 }
7028 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7029 rb_str_cat(undumped, (char *)buf, codelen);
7030 s += hexlen;
7031 }
7032 }
7033 else { /* handle \uXXXX form */
7034 c = scan_hex(s, 4, &hexlen);
7035 if (hexlen != 4) {
7036 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7037 }
7038 if (0xd800 <= c && c <= 0xdfff) {
7039 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7040 }
7041 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7042 rb_str_cat(undumped, (char *)buf, codelen);
7043 s += hexlen;
7044 }
7045 break;
7046 case 'x':
7047 if (*utf8) {
7048 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7049 }
7050 *binary = true;
7051 if (++s >= s_end) {
7052 rb_raise(rb_eRuntimeError, "invalid hex escape");
7053 }
7054 *buf = scan_hex(s, 2, &hexlen);
7055 if (hexlen != 2) {
7056 rb_raise(rb_eRuntimeError, "invalid hex escape");
7057 }
7058 rb_str_cat(undumped, (char *)buf, 1);
7059 s += hexlen;
7060 break;
7061 default:
7062 rb_str_cat(undumped, s-1, 2);
7063 s++;
7064 }
7065
7066 *ss = s;
7067}
7068
/* Forward declaration: str_undump below calls this before its definition. */
7069static VALUE rb_str_is_ascii_only_p(VALUE str);
7070
7071/*
7072 * call-seq:
7073 * undump -> string
7074 *
7075 * Returns an unescaped version of +self+:
7076 *
7077 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7078 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7079 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7080 * s_undumped == s_orig # => true
7081 *
7082 * Related: String#dump (inverse of String#undump).
7083 *
7084 */
7085
7086static VALUE
7087str_undump(VALUE str)
7088{
7089 const char *s = RSTRING_PTR(str);
7090 const char *s_end = RSTRING_END(str);
7091 rb_encoding *enc = rb_enc_get(str);
7092 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7093 bool utf8 = false;
7094 bool binary = false;
7095 int w;
7096
7098 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7099 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7100 }
7101 if (!str_null_check(str, &w)) {
7102 rb_raise(rb_eRuntimeError, "string contains null byte");
7103 }
7104 if (RSTRING_LEN(str) < 2) goto invalid_format;
7105 if (*s != '"') goto invalid_format;
7106
7107 /* strip '"' at the start */
7108 s++;
7109
7110 for (;;) {
7111 if (s >= s_end) {
7112 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7113 }
7114
7115 if (*s == '"') {
7116 /* epilogue */
7117 s++;
7118 if (s == s_end) {
7119 /* ascii compatible dumped string */
7120 break;
7121 }
7122 else {
7123 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7124 static const char dup_suffix[] = ".dup";
7125 const char *encname;
7126 int encidx;
7127 ptrdiff_t size;
7128
7129 /* check separately for strings dumped by older versions */
7130 size = sizeof(dup_suffix) - 1;
7131 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7132
7133 size = sizeof(force_encoding_suffix) - 1;
7134 if (s_end - s <= size) goto invalid_format;
7135 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7136 s += size;
7137
7138 if (utf8) {
7139 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7140 }
7141
7142 encname = s;
7143 s = memchr(s, '"', s_end-s);
7144 size = s - encname;
7145 if (!s) goto invalid_format;
7146 if (s_end - s != 2) goto invalid_format;
7147 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7148
7149 encidx = rb_enc_find_index2(encname, (long)size);
7150 if (encidx < 0) {
7151 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7152 }
7153 rb_enc_associate_index(undumped, encidx);
7154 }
7155 break;
7156 }
7157
7158 if (*s == '\\') {
7159 s++;
7160 if (s >= s_end) {
7161 rb_raise(rb_eRuntimeError, "invalid escape");
7162 }
7163 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7164 }
7165 else {
7166 rb_str_cat(undumped, s++, 1);
7167 }
7168 }
7169
7170 return undumped;
7171invalid_format:
7172 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7173}
7174
7175static void
7176rb_str_check_dummy_enc(rb_encoding *enc)
7177{
7178 if (rb_enc_dummy_p(enc)) {
7179 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7180 rb_enc_name(enc));
7181 }
7182}
7183
7184static rb_encoding *
7185str_true_enc(VALUE str)
7186{
7187 rb_encoding *enc = STR_ENC_GET(str);
7188 rb_str_check_dummy_enc(enc);
7189 return enc;
7190}
7191
7192static OnigCaseFoldType
7193check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7194{
7195 if (argc==0)
7196 return flags;
7197 if (argc>2)
7198 rb_raise(rb_eArgError, "too many options");
7199 if (argv[0]==sym_turkic) {
7200 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7201 if (argc==2) {
7202 if (argv[1]==sym_lithuanian)
7203 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7204 else
7205 rb_raise(rb_eArgError, "invalid second option");
7206 }
7207 }
7208 else if (argv[0]==sym_lithuanian) {
7209 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7210 if (argc==2) {
7211 if (argv[1]==sym_turkic)
7212 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7213 else
7214 rb_raise(rb_eArgError, "invalid second option");
7215 }
7216 }
7217 else if (argc>1)
7218 rb_raise(rb_eArgError, "too many options");
7219 else if (argv[0]==sym_ascii)
7220 flags |= ONIGENC_CASE_ASCII_ONLY;
7221 else if (argv[0]==sym_fold) {
7222 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7223 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7224 else
7225 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7226 }
7227 else
7228 rb_raise(rb_eArgError, "invalid option");
7229 return flags;
7230}
7231
7232static inline bool
7233case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7234{
7235 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7236 return true;
7237 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7238}
7239
7240/* 16 bytes should be enough to absorb the length increase of any single character; the constant below is set to 20 (NOTE(review): presumably for extra headroom - confirm) */
7241#define CASE_MAPPING_ADDITIONAL_LENGTH 20
/* CASEMAP_DEBUG defaults to 0 unless it is already defined. */
7242#ifndef CASEMAP_DEBUG
7243# define CASEMAP_DEBUG 0
7244#endif
7245
7246struct mapping_buffer;
7247typedef struct mapping_buffer {
7248 size_t capa;
7249 size_t used;
7250 struct mapping_buffer *next;
7251 OnigUChar space[FLEX_ARY_LEN];
7253
7254static void
7255mapping_buffer_free(void *p)
7256{
7257 mapping_buffer *previous_buffer;
7258 mapping_buffer *current_buffer = p;
7259 while (current_buffer) {
7260 previous_buffer = current_buffer;
7261 current_buffer = current_buffer->next;
7262 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7263 }
7264}
7265
/* Typed-data descriptor used to wrap a chain of mapping buffers in an
 * object; its second function slot is mapping_buffer_free. */
7266static const rb_data_type_t mapping_buffer_type = {
7267 "mapping_buffer",
7268 {0, mapping_buffer_free,}
7269};
7270
7271static VALUE
7272rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7273{
7274 VALUE target;
7275
7276 const OnigUChar *source_current, *source_end;
7277 int target_length = 0;
7278 VALUE buffer_anchor;
7279 mapping_buffer *current_buffer = 0;
7280 mapping_buffer **pre_buffer;
7281 size_t buffer_count = 0;
7282 int buffer_length_or_invalid;
7283
7284 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7285
7286 source_current = (OnigUChar*)RSTRING_PTR(source);
7287 source_end = (OnigUChar*)RSTRING_END(source);
7288
7289 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7290 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7291 while (source_current < source_end) {
7292 /* increase multiplier using buffer count to converge quickly */
7293 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7294 if (CASEMAP_DEBUG) {
7295 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7296 }
7297 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7298 *pre_buffer = current_buffer;
7299 pre_buffer = &current_buffer->next;
7300 current_buffer->next = NULL;
7301 current_buffer->capa = capa;
7302 buffer_length_or_invalid = enc->case_map(flags,
7303 &source_current, source_end,
7304 current_buffer->space,
7305 current_buffer->space+current_buffer->capa,
7306 enc);
7307 if (buffer_length_or_invalid < 0) {
7308 current_buffer = DATA_PTR(buffer_anchor);
7309 DATA_PTR(buffer_anchor) = 0;
7310 mapping_buffer_free(current_buffer);
7311 rb_raise(rb_eArgError, "input string invalid");
7312 }
7313 target_length += current_buffer->used = buffer_length_or_invalid;
7314 }
7315 if (CASEMAP_DEBUG) {
7316 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7317 }
7318
7319 if (buffer_count==1) {
7320 target = rb_str_new((const char*)current_buffer->space, target_length);
7321 }
7322 else {
7323 char *target_current;
7324
7325 target = rb_str_new(0, target_length);
7326 target_current = RSTRING_PTR(target);
7327 current_buffer = DATA_PTR(buffer_anchor);
7328 while (current_buffer) {
7329 memcpy(target_current, current_buffer->space, current_buffer->used);
7330 target_current += current_buffer->used;
7331 current_buffer = current_buffer->next;
7332 }
7333 }
7334 current_buffer = DATA_PTR(buffer_anchor);
7335 DATA_PTR(buffer_anchor) = 0;
7336 mapping_buffer_free(current_buffer);
7337
7338 RB_GC_GUARD(buffer_anchor);
7339
7340 /* TODO: check about string terminator character */
7341 str_enc_copy(target, source);
7342 /*ENC_CODERANGE_SET(mapped, cr);*/
7343
7344 return target;
7345}
7346
7347static VALUE
7348rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7349{
7350 const OnigUChar *source_current, *source_end;
7351 OnigUChar *target_current, *target_end;
7352 long old_length = RSTRING_LEN(source);
7353 int length_or_invalid;
7354
7355 if (old_length == 0) return Qnil;
7356
7357 source_current = (OnigUChar*)RSTRING_PTR(source);
7358 source_end = (OnigUChar*)RSTRING_END(source);
7359 if (source == target) {
7360 target_current = (OnigUChar*)source_current;
7361 target_end = (OnigUChar*)source_end;
7362 }
7363 else {
7364 target_current = (OnigUChar*)RSTRING_PTR(target);
7365 target_end = (OnigUChar*)RSTRING_END(target);
7366 }
7367
7368 length_or_invalid = onigenc_ascii_only_case_map(flags,
7369 &source_current, source_end,
7370 target_current, target_end, enc);
7371 if (length_or_invalid < 0)
7372 rb_raise(rb_eArgError, "input string invalid");
7373 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7374 fprintf(stderr, "problem with rb_str_ascii_casemap"
7375 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7376 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7377 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7378 }
7379
7380 str_enc_copy(target, source);
7381
7382 return target;
7383}
7384
7385static bool
7386upcase_single(VALUE str)
7387{
7388 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7389 bool modified = false;
7390
7391 while (s < send) {
7392 unsigned int c = *(unsigned char*)s;
7393
7394 if ('a' <= c && c <= 'z') {
7395 *s = 'A' + (c - 'a');
7396 modified = true;
7397 }
7398 s++;
7399 }
7400 return modified;
7401}
7402
7403/*
7404 * call-seq:
7405 * upcase!(*options) -> self or nil
7406 *
7407 * Upcases the characters in +self+;
7408 * returns +self+ if any changes were made, +nil+ otherwise:
7409 *
7410 * s = 'Hello World!' # => "Hello World!"
7411 * s.upcase! # => "HELLO WORLD!"
7412 * s # => "HELLO WORLD!"
7413 * s.upcase! # => nil
7414 *
7415 * The casing may be affected by the given +options+;
7416 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7417 *
7418 * Related: String#upcase, String#downcase, String#downcase!.
7419 *
7420 */
7421
7422static VALUE
7423rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7424{
7425 rb_encoding *enc;
7426 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7427
7428 flags = check_case_options(argc, argv, flags);
7429 str_modify_keep_cr(str);
7430 enc = str_true_enc(str);
7431 if (case_option_single_p(flags, enc, str)) {
7432 if (upcase_single(str))
7433 flags |= ONIGENC_CASE_MODIFIED;
7434 }
7435 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7436 rb_str_ascii_casemap(str, str, &flags, enc);
7437 else
7438 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7439
7440 if (ONIGENC_CASE_MODIFIED&flags) return str;
7441 return Qnil;
7442}
7443
7444
7445/*
7446 * call-seq:
7447 * upcase(*options) -> string
7448 *
7449 * Returns a string containing the upcased characters in +self+:
7450 *
7451 * s = 'Hello World!' # => "Hello World!"
7452 * s.upcase # => "HELLO WORLD!"
7453 *
7454 * The casing may be affected by the given +options+;
7455 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7456 *
7457 * Related: String#upcase!, String#downcase, String#downcase!.
7458 *
7459 */
7460
7461static VALUE
7462rb_str_upcase(int argc, VALUE *argv, VALUE str)
7463{
7464 rb_encoding *enc;
7465 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7466 VALUE ret;
7467
7468 flags = check_case_options(argc, argv, flags);
7469 enc = str_true_enc(str);
7470 if (case_option_single_p(flags, enc, str)) {
7471 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7472 str_enc_copy(ret, str);
7473 upcase_single(ret);
7474 }
7475 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7476 ret = rb_str_new(0, RSTRING_LEN(str));
7477 rb_str_ascii_casemap(str, ret, &flags, enc);
7478 }
7479 else {
7480 ret = rb_str_casemap(str, &flags, enc);
7481 }
7482
7483 return ret;
7484}
7485
7486static bool
7487downcase_single(VALUE str)
7488{
7489 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7490 bool modified = false;
7491
7492 while (s < send) {
7493 unsigned int c = *(unsigned char*)s;
7494
7495 if ('A' <= c && c <= 'Z') {
7496 *s = 'a' + (c - 'A');
7497 modified = true;
7498 }
7499 s++;
7500 }
7501
7502 return modified;
7503}
7504
7505/*
7506 * call-seq:
7507 * downcase!(*options) -> self or nil
7508 *
7509 * Downcases the characters in +self+;
7510 * returns +self+ if any changes were made, +nil+ otherwise:
7511 *
7512 * s = 'Hello World!' # => "Hello World!"
7513 * s.downcase! # => "hello world!"
7514 * s # => "hello world!"
7515 * s.downcase! # => nil
7516 *
7517 * The casing may be affected by the given +options+;
7518 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7519 *
7520 * Related: String#downcase, String#upcase, String#upcase!.
7521 *
7522 */
7523
7524static VALUE
7525rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7526{
7527 rb_encoding *enc;
7528 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7529
7530 flags = check_case_options(argc, argv, flags);
7531 str_modify_keep_cr(str);
7532 enc = str_true_enc(str);
7533 if (case_option_single_p(flags, enc, str)) {
7534 if (downcase_single(str))
7535 flags |= ONIGENC_CASE_MODIFIED;
7536 }
7537 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7538 rb_str_ascii_casemap(str, str, &flags, enc);
7539 else
7540 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7541
7542 if (ONIGENC_CASE_MODIFIED&flags) return str;
7543 return Qnil;
7544}
7545
7546
7547/*
7548 * call-seq:
7549 * downcase(*options) -> string
7550 *
7551 * Returns a string containing the downcased characters in +self+:
7552 *
7553 * s = 'Hello World!' # => "Hello World!"
7554 * s.downcase # => "hello world!"
7555 *
7556 * The casing may be affected by the given +options+;
7557 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7558 *
7559 * Related: String#downcase!, String#upcase, String#upcase!.
7560 *
7561 */
7562
7563static VALUE
7564rb_str_downcase(int argc, VALUE *argv, VALUE str)
7565{
7566 rb_encoding *enc;
7567 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7568 VALUE ret;
7569
7570 flags = check_case_options(argc, argv, flags);
7571 enc = str_true_enc(str);
7572 if (case_option_single_p(flags, enc, str)) {
7573 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7574 str_enc_copy(ret, str);
7575 downcase_single(ret);
7576 }
7577 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7578 ret = rb_str_new(0, RSTRING_LEN(str));
7579 rb_str_ascii_casemap(str, ret, &flags, enc);
7580 }
7581 else {
7582 ret = rb_str_casemap(str, &flags, enc);
7583 }
7584
7585 return ret;
7586}
7587
7588
7589/*
7590 * call-seq:
7591 * capitalize!(*options) -> self or nil
7592 *
7593 * Upcases the first character in +self+;
7594 * downcases the remaining characters;
7595 * returns +self+ if any changes were made, +nil+ otherwise:
7596 *
7597 * s = 'hello World!' # => "hello World!"
7598 * s.capitalize! # => "Hello world!"
7599 * s # => "Hello world!"
7600 * s.capitalize! # => nil
7601 *
7602 * The casing may be affected by the given +options+;
7603 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7604 *
7605 * Related: String#capitalize.
7606 *
7607 */
7608
7609static VALUE
7610rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7611{
7612 rb_encoding *enc;
7613 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7614
7615 flags = check_case_options(argc, argv, flags);
7616 str_modify_keep_cr(str);
7617 enc = str_true_enc(str);
7618 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7619 if (flags&ONIGENC_CASE_ASCII_ONLY)
7620 rb_str_ascii_casemap(str, str, &flags, enc);
7621 else
7622 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7623
7624 if (ONIGENC_CASE_MODIFIED&flags) return str;
7625 return Qnil;
7626}
7627
7628
7629/*
7630 * call-seq:
7631 * capitalize(*options) -> string
7632 *
7633 * Returns a string containing the characters in +self+;
7634 * the first character is upcased;
7635 * the remaining characters are downcased:
7636 *
7637 * s = 'hello World!' # => "hello World!"
7638 * s.capitalize # => "Hello world!"
7639 *
7640 * The casing may be affected by the given +options+;
7641 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7642 *
7643 * Related: String#capitalize!.
7644 *
7645 */
7646
7647static VALUE
7648rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7649{
7650 rb_encoding *enc;
7651 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7652 VALUE ret;
7653
7654 flags = check_case_options(argc, argv, flags);
7655 enc = str_true_enc(str);
7656 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7657 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7658 ret = rb_str_new(0, RSTRING_LEN(str));
7659 rb_str_ascii_casemap(str, ret, &flags, enc);
7660 }
7661 else {
7662 ret = rb_str_casemap(str, &flags, enc);
7663 }
7664 return ret;
7665}
7666
7667
7668/*
7669 * call-seq:
7670 * swapcase!(*options) -> self or nil
7671 *
7672 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
7674 * returns +self+ if any changes were made, +nil+ otherwise:
7675 *
7676 * s = 'Hello World!' # => "Hello World!"
7677 * s.swapcase! # => "hELLO wORLD!"
7678 * s # => "hELLO wORLD!"
7679 * ''.swapcase! # => nil
7680 *
7681 * The casing may be affected by the given +options+;
7682 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7683 *
7684 * Related: String#swapcase.
7685 *
7686 */
7687
7688static VALUE
7689rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7690{
7691 rb_encoding *enc;
7692 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7693
7694 flags = check_case_options(argc, argv, flags);
7695 str_modify_keep_cr(str);
7696 enc = str_true_enc(str);
7697 if (flags&ONIGENC_CASE_ASCII_ONLY)
7698 rb_str_ascii_casemap(str, str, &flags, enc);
7699 else
7700 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7701
7702 if (ONIGENC_CASE_MODIFIED&flags) return str;
7703 return Qnil;
7704}
7705
7706
7707/*
7708 * call-seq:
7709 * swapcase(*options) -> string
7710 *
7711 * Returns a string containing the characters in +self+, with cases reversed;
7712 * each uppercase character is downcased;
7713 * each lowercase character is upcased:
7714 *
7715 * s = 'Hello World!' # => "Hello World!"
7716 * s.swapcase # => "hELLO wORLD!"
7717 *
7718 * The casing may be affected by the given +options+;
7719 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7720 *
7721 * Related: String#swapcase!.
7722 *
7723 */
7724
7725static VALUE
7726rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7727{
7728 rb_encoding *enc;
7729 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7730 VALUE ret;
7731
7732 flags = check_case_options(argc, argv, flags);
7733 enc = str_true_enc(str);
7734 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7735 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7736 ret = rb_str_new(0, RSTRING_LEN(str));
7737 rb_str_ascii_casemap(str, ret, &flags, enc);
7738 }
7739 else {
7740 ret = rb_str_casemap(str, &flags, enc);
7741 }
7742 return ret;
7743}
7744
typedef unsigned char *USTR;

/* Iteration state that trnext() uses to walk a character specification. */
struct tr {
    int gen;                /* non-zero while trnext() is expanding a range */
    unsigned int now, max;  /* code point produced last; final code point of the current range */
    char *p, *pend;         /* read position inside the specification, and its end */
};
7752
7753static unsigned int
7754trnext(struct tr *t, rb_encoding *enc)
7755{
7756 int n;
7757
7758 for (;;) {
7759 nextpart:
7760 if (!t->gen) {
7761 if (t->p == t->pend) return -1;
7762 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7763 t->p += n;
7764 }
7765 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7766 t->p += n;
7767 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7768 t->p += n;
7769 if (t->p < t->pend) {
7770 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7771 t->p += n;
7772 if (t->now > c) {
7773 if (t->now < 0x80 && c < 0x80) {
7775 "invalid range \"%c-%c\" in string transliteration",
7776 t->now, c);
7777 }
7778 else {
7779 rb_raise(rb_eArgError, "invalid range in string transliteration");
7780 }
7781 continue; /* not reached */
7782 }
7783 t->gen = 1;
7784 t->max = c;
7785 }
7786 }
7787 return t->now;
7788 }
7789 else {
7790 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7791 if (t->now == t->max) {
7792 t->gen = 0;
7793 goto nextpart;
7794 }
7795 }
7796 if (t->now < t->max) {
7797 return t->now;
7798 }
7799 else {
7800 t->gen = 0;
7801 return t->max;
7802 }
7803 }
7804 }
7805}
7806
static VALUE rb_str_delete_bang(int,VALUE*,VALUE);

/*
 * Translates, in place, the characters of +str+ selected by +src+ into the
 * corresponding characters of +repl+.  When +sflag+ is non-zero, a translated
 * character equal to the translated character emitted just before it is
 * dropped.  An empty +repl+ delegates to rb_str_delete_bang.
 *
 * Returns +str+ when it was modified and Qnil otherwise (including when
 * +str+ is empty).
 */
static VALUE
tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
{
    /* Marker meaning "no translation" / "end of specification". */
    const unsigned int errc = -1;
    /* Translation for code points below 256; larger ones go through +hash+. */
    unsigned int trans[256];
    rb_encoding *enc, *e1, *e2;
    struct tr trsrc, trrepl;
    /* Set when +src+ starts with '^' (negated selector). */
    int cflag = 0;
    unsigned int c, c0, last = 0;
    int modify = 0, i, l;
    unsigned char *s, *send;
    VALUE hash = 0;
    int singlebyte = single_byte_optimizable(str);
    int termlen;
    int cr;

/* Promotes the tracked code range from 7BIT to VALID once a non-ASCII
   character is seen. */
#define CHECK_IF_ASCII(c) \
    (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
           (cr = ENC_CODERANGE_VALID) : 0)

    StringValue(src);
    StringValue(repl);
    if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
    if (RSTRING_LEN(repl) == 0) {
        return rb_str_delete_bang(1, &src, str);
    }

    cr = ENC_CODERANGE(str);
    e1 = rb_enc_check(str, src);
    e2 = rb_enc_check(str, repl);
    if (e1 == e2) {
        enc = e1;
    }
    else {
        enc = rb_enc_check(src, repl);
    }
    trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
    /* A leading '^' negates the selector, but only when something follows it. */
    if (RSTRING_LEN(src) > 1 &&
        rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
        trsrc.p + l < trsrc.pend) {
        cflag = 1;
        trsrc.p += l;
    }
    trrepl.p = RSTRING_PTR(repl);
    trrepl.pend = trrepl.p + RSTRING_LEN(repl);
    trsrc.gen = trrepl.gen = 0;
    trsrc.now = trrepl.now = 0;
    trsrc.max = trrepl.max = 0;

    if (cflag) {
        /* Negated: listed characters are marked errc (left alone); every
         * other slot ends up holding the last replacement character. */
        for (i=0; i<256; i++) {
            trans[i] = 1;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            if (c < 256) {
                trans[c] = errc;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), Qtrue);
            }
        }
        while ((c = trnext(&trrepl, enc)) != errc)
            /* retrieve last replacer */;
        last = trrepl.now;
        for (i=0; i<256; i++) {
            if (trans[i] != errc) {
                trans[i] = last;
            }
        }
    }
    else {
        unsigned int r;

        /* Positional mapping: the n-th selected character maps to the n-th
         * replacement; once +repl+ runs out, trrepl.now is reused. */
        for (i=0; i<256; i++) {
            trans[i] = errc;
        }
        while ((c = trnext(&trsrc, enc)) != errc) {
            r = trnext(&trrepl, enc);
            if (r == errc) r = trrepl.now;
            if (c < 256) {
                trans[c] = r;
                /* A multi-byte replacement rules out the byte-wise path. */
                if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
            }
            else {
                if (!hash) hash = rb_hash_new();
                rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
            }
        }
    }

    /* Start from 7BIT again; CHECK_IF_ASCII raises it back to VALID when a
     * non-ASCII character is encountered below. */
    if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
        cr = ENC_CODERANGE_7BIT;
    str_modify_keep_cr(str);
    s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
    termlen = rb_enc_mbminlen(enc);
    if (sflag) {
        /* Squeezing translation: the result is built in a fresh buffer. */
        int clen, tlen;
        long offset, max = RSTRING_LEN(str);
        /* Last translated character written, or -1 after an untranslated one. */
        unsigned int save = -1;
        unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;

        while (s < send) {
            int may_modify = 0;

            c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            s += clen;
            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                /* NOTE(review): the non-squeezing branch below uses
                 * `cflag ? last : errc` at this point; here a code point
                 * >= 256 is left untranslated even when cflag is set.
                 * Confirm whether the difference is intended. */
                c = errc;
            }
            if (c != (unsigned int)-1) {
                if (save == c) {
                    /* Same translated character as the previous one: drop it. */
                    CHECK_IF_ASCII(c);
                    continue;
                }
                save = c;
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* Untranslated: copy the original character through. */
                save = -1;
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* Grow the buffer when the next character would not fit. */
            if ((offset = t - buf) + tlen > max) {
                size_t MAYBE_UNUSED(old) = max + termlen;
                max = offset + tlen + (send - s);
                SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
                t = buf + offset;
            }
            rb_enc_mbcput(c, t, enc);
            if (may_modify && memcmp(s, t, tlen) != 0) {
                modify = 1;
            }
            CHECK_IF_ASCII(c);
            t += tlen;
        }
        /* Release the old heap buffer and install the new one. */
        if (!STR_EMBED_P(str)) {
            ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        TERM_FILL((char *)t, termlen);
        RSTRING(str)->as.heap.ptr = (char *)buf;
        RSTRING(str)->as.heap.len = t - buf;
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }
    else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
        /* Byte-wise translation directly inside the existing buffer. */
        while (s < send) {
            c = (unsigned char)*s;
            if (trans[c] != errc) {
                if (!cflag) {
                    c = trans[c];
                    *s = c;
                    modify = 1;
                }
                else {
                    *s = last;
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s++;
        }
    }
    else {
        /* General translation into a fresh buffer, initially sized at 1.2
         * times the remaining input. */
        int clen, tlen;
        long offset, max = (long)((send - s) * 1.2);
        unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;

        while (s < send) {
            int may_modify = 0;
            c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
            tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);

            if (c < 256) {
                c = trans[c];
            }
            else if (hash) {
                VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
                if (NIL_P(tmp)) {
                    if (cflag) c = last;
                    else c = errc;
                }
                else if (cflag) c = errc;
                else c = NUM2INT(tmp);
            }
            else {
                c = cflag ? last : errc;
            }
            if (c != errc) {
                tlen = rb_enc_codelen(c, enc);
                modify = 1;
            }
            else {
                /* Untranslated: copy the original character through. */
                c = c0;
                if (enc != e1) may_modify = 1;
            }
            /* Grow the buffer when the next character would not fit. */
            if ((offset = t - buf) + tlen > max) {
                size_t MAYBE_UNUSED(old) = max + termlen;
                max = offset + tlen + (long)((send - s) * 1.2);
                SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
                t = buf + offset;
            }
            if (s != t) {
                rb_enc_mbcput(c, t, enc);
                if (may_modify && memcmp(s, t, tlen) != 0) {
                    modify = 1;
                }
            }
            CHECK_IF_ASCII(c);
            s += clen;
            t += tlen;
        }
        /* Release the old heap buffer and install the new one. */
        if (!STR_EMBED_P(str)) {
            ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
        }
        TERM_FILL((char *)t, termlen);
        RSTRING(str)->as.heap.ptr = (char *)buf;
        RSTRING(str)->as.heap.len = t - buf;
        STR_SET_NOEMBED(str);
        RSTRING(str)->as.heap.aux.capa = max;
    }

    if (modify) {
        if (cr != ENC_CODERANGE_BROKEN)
            ENC_CODERANGE_SET(str, cr);
        rb_enc_associate(str, enc);
        return str;
    }
    return Qnil;
}
8054
8055
8056/*
8057 * call-seq:
8058 * tr!(selector, replacements) -> self or nil
8059 *
8060 * Like String#tr, but modifies +self+ in place.
8061 * Returns +self+ if any changes were made, +nil+ otherwise.
8062 *
8063 */
8064
8065static VALUE
8066rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8067{
8068 return tr_trans(str, src, repl, 0);
8069}
8070
8071
8072/*
8073 * call-seq:
8074 * tr(selector, replacements) -> new_string
8075 *
8076 * Returns a copy of +self+ with each character specified by string +selector+
8077 * translated to the corresponding character in string +replacements+.
8078 * The correspondence is _positional_:
8079 *
8080 * - Each occurrence of the first character specified by +selector+
8081 * is translated to the first character in +replacements+.
8082 * - Each occurrence of the second character specified by +selector+
8083 * is translated to the second character in +replacements+.
8084 * - And so on.
8085 *
8086 * Example:
8087 *
8088 * 'hello'.tr('el', 'ip') #=> "hippo"
8089 *
8090 * If +replacements+ is shorter than +selector+,
8091 * it is implicitly padded with its own last character:
8092 *
8093 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8094 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8095 *
8096 * Arguments +selector+ and +replacements+ must be valid character selectors
8097 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8098 * and may use any of its valid forms, including negation, ranges, and escaping:
8099 *
8100 * # Negation.
8101 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8102 * # Ranges.
8103 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8104 * # Escapes.
8105 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8106 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8107 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8108 *
8109 */
8110
8111static VALUE
8112rb_str_tr(VALUE str, VALUE src, VALUE repl)
8113{
8114 str = str_duplicate(rb_cString, str);
8115 tr_trans(str, src, repl, 0);
8116 return str;
8117}
8118
8119#define TR_TABLE_MAX (UCHAR_MAX+1)
8120#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8121static void
8122tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8123 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8124{
8125 const unsigned int errc = -1;
8126 char buf[TR_TABLE_MAX];
8127 struct tr tr;
8128 unsigned int c;
8129 VALUE table = 0, ptable = 0;
8130 int i, l, cflag = 0;
8131
8132 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8133 tr.gen = tr.now = tr.max = 0;
8134
8135 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8136 cflag = 1;
8137 tr.p += l;
8138 }
8139 if (first) {
8140 for (i=0; i<TR_TABLE_MAX; i++) {
8141 stable[i] = 1;
8142 }
8143 stable[TR_TABLE_MAX] = cflag;
8144 }
8145 else if (stable[TR_TABLE_MAX] && !cflag) {
8146 stable[TR_TABLE_MAX] = 0;
8147 }
8148 for (i=0; i<TR_TABLE_MAX; i++) {
8149 buf[i] = cflag;
8150 }
8151
8152 while ((c = trnext(&tr, enc)) != errc) {
8153 if (c < TR_TABLE_MAX) {
8154 buf[(unsigned char)c] = !cflag;
8155 }
8156 else {
8157 VALUE key = UINT2NUM(c);
8158
8159 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8160 if (cflag) {
8161 ptable = *ctablep;
8162 table = ptable ? ptable : rb_hash_new();
8163 *ctablep = table;
8164 }
8165 else {
8166 table = rb_hash_new();
8167 ptable = *tablep;
8168 *tablep = table;
8169 }
8170 }
8171 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8172 rb_hash_aset(table, key, Qtrue);
8173 }
8174 }
8175 }
8176 for (i=0; i<TR_TABLE_MAX; i++) {
8177 stable[i] = stable[i] && buf[i];
8178 }
8179 if (!table && !cflag) {
8180 *tablep = 0;
8181 }
8182}
8183
8184
8185static int
8186tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8187{
8188 if (c < TR_TABLE_MAX) {
8189 return table[c] != 0;
8190 }
8191 else {
8192 VALUE v = UINT2NUM(c);
8193
8194 if (del) {
8195 if (!NIL_P(rb_hash_lookup(del, v)) &&
8196 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8197 return TRUE;
8198 }
8199 }
8200 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8201 return FALSE;
8202 }
8203 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8204 }
8205}
8206
8207/*
8208 * call-seq:
8209 * delete!(*selectors) -> self or nil
8210 *
8211 * Like String#delete, but modifies +self+ in place.
8212 * Returns +self+ if any changes were made, +nil+ otherwise.
8213 *
8214 */
8215
8216static VALUE
8217rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8218{
8219 char squeez[TR_TABLE_SIZE];
8220 rb_encoding *enc = 0;
8221 char *s, *send, *t;
8222 VALUE del = 0, nodel = 0;
8223 int modify = 0;
8224 int i, ascompat, cr;
8225
8226 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8227 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8228 for (i=0; i<argc; i++) {
8229 VALUE s = argv[i];
8230
8231 StringValue(s);
8232 enc = rb_enc_check(str, s);
8233 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8234 }
8235
8236 str_modify_keep_cr(str);
8237 ascompat = rb_enc_asciicompat(enc);
8238 s = t = RSTRING_PTR(str);
8239 send = RSTRING_END(str);
8240 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8241 while (s < send) {
8242 unsigned int c;
8243 int clen;
8244
8245 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8246 if (squeez[c]) {
8247 modify = 1;
8248 }
8249 else {
8250 if (t != s) *t = c;
8251 t++;
8252 }
8253 s++;
8254 }
8255 else {
8256 c = rb_enc_codepoint_len(s, send, &clen, enc);
8257
8258 if (tr_find(c, squeez, del, nodel)) {
8259 modify = 1;
8260 }
8261 else {
8262 if (t != s) rb_enc_mbcput(c, t, enc);
8263 t += clen;
8265 }
8266 s += clen;
8267 }
8268 }
8269 TERM_FILL(t, TERM_LEN(str));
8270 STR_SET_LEN(str, t - RSTRING_PTR(str));
8271 ENC_CODERANGE_SET(str, cr);
8272
8273 if (modify) return str;
8274 return Qnil;
8275}
8276
8277
8278/*
8279 * call-seq:
8280 * delete(*selectors) -> new_string
8281 *
8282 * Returns a copy of +self+ with characters specified by +selectors+ removed
8283 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8284 *
8285 * "hello".delete "l","lo" #=> "heo"
8286 * "hello".delete "lo" #=> "he"
8287 * "hello".delete "aeiou", "^e" #=> "hell"
8288 * "hello".delete "ej-m" #=> "ho"
8289 *
8290 */
8291
8292static VALUE
8293rb_str_delete(int argc, VALUE *argv, VALUE str)
8294{
8295 str = str_duplicate(rb_cString, str);
8296 rb_str_delete_bang(argc, argv, str);
8297 return str;
8298}
8299
8300
8301/*
8302 * call-seq:
8303 * squeeze!(*selectors) -> self or nil
8304 *
8305 * Like String#squeeze, but modifies +self+ in place.
8306 * Returns +self+ if any changes were made, +nil+ otherwise.
8307 */
8308
8309static VALUE
8310rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8311{
8312 char squeez[TR_TABLE_SIZE];
8313 rb_encoding *enc = 0;
8314 VALUE del = 0, nodel = 0;
8315 unsigned char *s, *send, *t;
8316 int i, modify = 0;
8317 int ascompat, singlebyte = single_byte_optimizable(str);
8318 unsigned int save;
8319
8320 if (argc == 0) {
8321 enc = STR_ENC_GET(str);
8322 }
8323 else {
8324 for (i=0; i<argc; i++) {
8325 VALUE s = argv[i];
8326
8327 StringValue(s);
8328 enc = rb_enc_check(str, s);
8329 if (singlebyte && !single_byte_optimizable(s))
8330 singlebyte = 0;
8331 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8332 }
8333 }
8334
8335 str_modify_keep_cr(str);
8336 s = t = (unsigned char *)RSTRING_PTR(str);
8337 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8338 send = (unsigned char *)RSTRING_END(str);
8339 save = -1;
8340 ascompat = rb_enc_asciicompat(enc);
8341
8342 if (singlebyte) {
8343 while (s < send) {
8344 unsigned int c = *s++;
8345 if (c != save || (argc > 0 && !squeez[c])) {
8346 *t++ = save = c;
8347 }
8348 }
8349 }
8350 else {
8351 while (s < send) {
8352 unsigned int c;
8353 int clen;
8354
8355 if (ascompat && (c = *s) < 0x80) {
8356 if (c != save || (argc > 0 && !squeez[c])) {
8357 *t++ = save = c;
8358 }
8359 s++;
8360 }
8361 else {
8362 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8363
8364 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8365 if (t != s) rb_enc_mbcput(c, t, enc);
8366 save = c;
8367 t += clen;
8368 }
8369 s += clen;
8370 }
8371 }
8372 }
8373
8374 TERM_FILL((char *)t, TERM_LEN(str));
8375 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8376 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8377 modify = 1;
8378 }
8379
8380 if (modify) return str;
8381 return Qnil;
8382}
8383
8384
8385/*
8386 * call-seq:
8387 * squeeze(*selectors) -> new_string
8388 *
8389 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8390 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8391 *
8392 * "Squeezed" means that each multiple-character run of a selected character
8393 * is squeezed down to a single character;
8394 * with no arguments given, squeezes all characters:
8395 *
8396 * "yellow moon".squeeze #=> "yelow mon"
 *     "  now   is  the".squeeze(" ")         #=> " now is the"
8398 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8399 *
8400 */
8401
8402static VALUE
8403rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8404{
8405 str = str_duplicate(rb_cString, str);
8406 rb_str_squeeze_bang(argc, argv, str);
8407 return str;
8408}
8409
8410
8411/*
8412 * call-seq:
8413 * tr_s!(selector, replacements) -> self or nil
8414 *
8415 * Like String#tr_s, but modifies +self+ in place.
8416 * Returns +self+ if any changes were made, +nil+ otherwise.
8417 *
8418 * Related: String#squeeze!.
8419 */
8420
8421static VALUE
8422rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8423{
8424 return tr_trans(str, src, repl, 1);
8425}
8426
8427
8428/*
8429 * call-seq:
8430 * tr_s(selector, replacements) -> string
8431 *
8432 * Like String#tr, but also squeezes the modified portions of the translated string;
8433 * returns a new string (translated and squeezed).
8434 *
8435 * 'hello'.tr_s('l', 'r') #=> "hero"
8436 * 'hello'.tr_s('el', '-') #=> "h-o"
8437 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8438 *
8439 * Related: String#squeeze.
8440 *
8441 */
8442
8443static VALUE
8444rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8445{
8446 str = str_duplicate(rb_cString, str);
8447 tr_trans(str, src, repl, 1);
8448 return str;
8449}
8450
8451
8452/*
8453 * call-seq:
8454 * count(*selectors) -> integer
8455 *
8456 * Returns the total number of characters in +self+
8457 * that are specified by the given +selectors+
8458 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8459 *
8460 * a = "hello world"
8461 * a.count "lo" #=> 5
8462 * a.count "lo", "o" #=> 2
8463 * a.count "hello", "^l" #=> 4
8464 * a.count "ej-m" #=> 4
8465 *
8466 * "hello^world".count "\\^aeiou" #=> 4
8467 * "hello-world".count "a\\-eo" #=> 4
8468 *
8469 * c = "hello world\\r\\n"
8470 * c.count "\\" #=> 2
8471 * c.count "\\A" #=> 0
8472 * c.count "X-\\w" #=> 3
8473 */
8474
8475static VALUE
8476rb_str_count(int argc, VALUE *argv, VALUE str)
8477{
8478 char table[TR_TABLE_SIZE];
8479 rb_encoding *enc = 0;
8480 VALUE del = 0, nodel = 0, tstr;
8481 char *s, *send;
8482 int i;
8483 int ascompat;
8484 size_t n = 0;
8485
8486 rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);
8487
8488 tstr = argv[0];
8489 StringValue(tstr);
8490 enc = rb_enc_check(str, tstr);
8491 if (argc == 1) {
8492 const char *ptstr;
8493 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8494 (ptstr = RSTRING_PTR(tstr),
8495 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8496 !is_broken_string(str)) {
8497 int clen;
8498 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8499
8500 s = RSTRING_PTR(str);
8501 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8502 send = RSTRING_END(str);
8503 while (s < send) {
8504 if (*(unsigned char*)s++ == c) n++;
8505 }
8506 return SIZET2NUM(n);
8507 }
8508 }
8509
8510 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8511 for (i=1; i<argc; i++) {
8512 tstr = argv[i];
8513 StringValue(tstr);
8514 enc = rb_enc_check(str, tstr);
8515 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8516 }
8517
8518 s = RSTRING_PTR(str);
8519 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8520 send = RSTRING_END(str);
8521 ascompat = rb_enc_asciicompat(enc);
8522 while (s < send) {
8523 unsigned int c;
8524
8525 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8526 if (table[c]) {
8527 n++;
8528 }
8529 s++;
8530 }
8531 else {
8532 int clen;
8533 c = rb_enc_codepoint_len(s, send, &clen, enc);
8534 if (tr_find(c, table, del, nodel)) {
8535 n++;
8536 }
8537 s += clen;
8538 }
8539 }
8540
8541 return SIZET2NUM(n);
8542}
8543
8544static VALUE
8545rb_fs_check(VALUE val)
8546{
8547 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8548 val = rb_check_string_type(val);
8549 if (NIL_P(val)) return 0;
8550 }
8551 return val;
8552}
8553
/*
 * Lookup table indexed by byte value: 1 for the six ASCII whitespace bytes
 * (TAB, LF, VT, FF, CR and space), 0 for every other byte.
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8574
8575static long
8576split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8577{
8578 if (empty_count >= 0 && len == 0) {
8579 return empty_count + 1;
8580 }
8581 if (empty_count > 0) {
8582 /* make different substrings */
8583 if (result) {
8584 do {
8585 rb_ary_push(result, str_new_empty_String(str));
8586 } while (--empty_count > 0);
8587 }
8588 else {
8589 do {
8590 rb_yield(str_new_empty_String(str));
8591 } while (--empty_count > 0);
8592 }
8593 }
8594 str = rb_str_subseq(str, beg, len);
8595 if (result) {
8596 rb_ary_push(result, str);
8597 }
8598 else {
8599 rb_yield(str);
8600 }
8601 return empty_count;
8602}
8603
/* How String#split interprets its separator. */
typedef enum {
    SPLIT_TYPE_AWK,     /* a single space, or no separator at all */
    SPLIT_TYPE_STRING,  /* String separator */
    SPLIT_TYPE_REGEXP,  /* Regexp separator */
    SPLIT_TYPE_CHARS    /* empty separator: split into characters */
} split_type_t;
8607
8608static split_type_t
8609literal_split_pattern(VALUE spat, split_type_t default_type)
8610{
8611 rb_encoding *enc = STR_ENC_GET(spat);
8612 const char *ptr;
8613 long len;
8614 RSTRING_GETMEM(spat, ptr, len);
8615 if (len == 0) {
8616 /* Special case - split into chars */
8617 return SPLIT_TYPE_CHARS;
8618 }
8619 else if (rb_enc_asciicompat(enc)) {
8620 if (len == 1 && ptr[0] == ' ') {
8621 return SPLIT_TYPE_AWK;
8622 }
8623 }
8624 else {
8625 int l;
8626 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8627 return SPLIT_TYPE_AWK;
8628 }
8629 }
8630 return default_type;
8631}
8632
8633/*
8634 * call-seq:
8635 * split(field_sep = $;, limit = nil) -> array
8636 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8637 *
8638 * :include: doc/string/split.rdoc
8639 *
8640 */
8641
8642static VALUE
8643rb_str_split_m(int argc, VALUE *argv, VALUE str)
8644{
8645 rb_encoding *enc;
8646 VALUE spat;
8647 VALUE limit;
8648 split_type_t split_type;
8649 long beg, end, i = 0, empty_count = -1;
8650 int lim = 0;
8651 VALUE result, tmp;
8652
8653 result = rb_block_given_p() ? Qfalse : Qnil;
8654 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8655 lim = NUM2INT(limit);
8656 if (lim <= 0) limit = Qnil;
8657 else if (lim == 1) {
8658 if (RSTRING_LEN(str) == 0)
8659 return result ? rb_ary_new2(0) : str;
8660 tmp = str_duplicate(rb_cString, str);
8661 if (!result) {
8662 rb_yield(tmp);
8663 return str;
8664 }
8665 return rb_ary_new3(1, tmp);
8666 }
8667 i = 1;
8668 }
8669 if (NIL_P(limit) && !lim) empty_count = 0;
8670
8671 enc = STR_ENC_GET(str);
8672 split_type = SPLIT_TYPE_REGEXP;
8673 if (!NIL_P(spat)) {
8674 spat = get_pat_quoted(spat, 0);
8675 }
8676 else if (NIL_P(spat = rb_fs)) {
8677 split_type = SPLIT_TYPE_AWK;
8678 }
8679 else if (!(spat = rb_fs_check(spat))) {
8680 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8681 }
8682 else {
8683 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8684 }
8685 if (split_type != SPLIT_TYPE_AWK) {
8686 switch (BUILTIN_TYPE(spat)) {
8687 case T_REGEXP:
8688 rb_reg_options(spat); /* check if uninitialized */
8689 tmp = RREGEXP_SRC(spat);
8690 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8691 if (split_type == SPLIT_TYPE_AWK) {
8692 spat = tmp;
8693 split_type = SPLIT_TYPE_STRING;
8694 }
8695 break;
8696
8697 case T_STRING:
8698 mustnot_broken(spat);
8699 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8700 break;
8701
8702 default:
8704 }
8705 }
8706
8707#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8708
8709 if (result) result = rb_ary_new();
8710 beg = 0;
8711 char *ptr = RSTRING_PTR(str);
8712 char *eptr = RSTRING_END(str);
8713 if (split_type == SPLIT_TYPE_AWK) {
8714 char *bptr = ptr;
8715 int skip = 1;
8716 unsigned int c;
8717
8718 end = beg;
8719 if (is_ascii_string(str)) {
8720 while (ptr < eptr) {
8721 c = (unsigned char)*ptr++;
8722 if (skip) {
8723 if (ascii_isspace(c)) {
8724 beg = ptr - bptr;
8725 }
8726 else {
8727 end = ptr - bptr;
8728 skip = 0;
8729 if (!NIL_P(limit) && lim <= i) break;
8730 }
8731 }
8732 else if (ascii_isspace(c)) {
8733 SPLIT_STR(beg, end-beg);
8734 skip = 1;
8735 beg = ptr - bptr;
8736 if (!NIL_P(limit)) ++i;
8737 }
8738 else {
8739 end = ptr - bptr;
8740 }
8741 }
8742 }
8743 else {
8744 while (ptr < eptr) {
8745 int n;
8746
8747 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8748 ptr += n;
8749 if (skip) {
8750 if (rb_isspace(c)) {
8751 beg = ptr - bptr;
8752 }
8753 else {
8754 end = ptr - bptr;
8755 skip = 0;
8756 if (!NIL_P(limit) && lim <= i) break;
8757 }
8758 }
8759 else if (rb_isspace(c)) {
8760 SPLIT_STR(beg, end-beg);
8761 skip = 1;
8762 beg = ptr - bptr;
8763 if (!NIL_P(limit)) ++i;
8764 }
8765 else {
8766 end = ptr - bptr;
8767 }
8768 }
8769 }
8770 }
8771 else if (split_type == SPLIT_TYPE_STRING) {
8772 char *str_start = ptr;
8773 char *substr_start = ptr;
8774 char *sptr = RSTRING_PTR(spat);
8775 long slen = RSTRING_LEN(spat);
8776
8777 mustnot_broken(str);
8778 enc = rb_enc_check(str, spat);
8779 while (ptr < eptr &&
8780 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8781 /* Check we are at the start of a char */
8782 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8783 if (t != ptr + end) {
8784 ptr = t;
8785 continue;
8786 }
8787 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8788 ptr += end + slen;
8789 substr_start = ptr;
8790 if (!NIL_P(limit) && lim <= ++i) break;
8791 }
8792 beg = ptr - str_start;
8793 }
8794 else if (split_type == SPLIT_TYPE_CHARS) {
8795 char *str_start = ptr;
8796 int n;
8797
8798 mustnot_broken(str);
8799 enc = rb_enc_get(str);
8800 while (ptr < eptr &&
8801 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8802 SPLIT_STR(ptr - str_start, n);
8803 ptr += n;
8804 if (!NIL_P(limit) && lim <= ++i) break;
8805 }
8806 beg = ptr - str_start;
8807 }
8808 else {
8809 long len = RSTRING_LEN(str);
8810 long start = beg;
8811 long idx;
8812 int last_null = 0;
8813 struct re_registers *regs;
8814 VALUE match = 0;
8815
8816 for (; rb_reg_search(spat, str, start, 0) >= 0;
8817 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8818 match = rb_backref_get();
8819 if (!result) rb_match_busy(match);
8820 regs = RMATCH_REGS(match);
8821 end = BEG(0);
8822 if (start == end && BEG(0) == END(0)) {
8823 if (!ptr) {
8824 SPLIT_STR(0, 0);
8825 break;
8826 }
8827 else if (last_null == 1) {
8828 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8829 beg = start;
8830 }
8831 else {
8832 if (start == len)
8833 start++;
8834 else
8835 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8836 last_null = 1;
8837 continue;
8838 }
8839 }
8840 else {
8841 SPLIT_STR(beg, end-beg);
8842 beg = start = END(0);
8843 }
8844 last_null = 0;
8845
8846 for (idx=1; idx < regs->num_regs; idx++) {
8847 if (BEG(idx) == -1) continue;
8848 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8849 }
8850 if (!NIL_P(limit) && lim <= ++i) break;
8851 }
8852 if (match) rb_match_unbusy(match);
8853 }
8854 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8855 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8856 }
8857
8858 return result ? result : str;
8859}
8860
8861VALUE
8862rb_str_split(VALUE str, const char *sep0)
8863{
8864 VALUE sep;
8865
8866 StringValue(str);
8867 sep = rb_str_new_cstr(sep0);
8868 return rb_str_split_m(1, &sep, str);
8869}
8870
/* Yields 0 when a block is given, otherwise a new array with capacity +size+ (+m+ is unused). */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
8872
8873static inline int
8874enumerator_element(VALUE ary, VALUE e)
8875{
8876 if (ary) {
8877 rb_ary_push(ary, e);
8878 return 0;
8879 }
8880 else {
8881 rb_yield(e);
8882 return 1;
8883 }
8884}
8885
8886#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8887
8888static const char *
8889chomp_newline(const char *p, const char *e, rb_encoding *enc)
8890{
8891 const char *prev = rb_enc_prev_char(p, e, e, enc);
8892 if (rb_enc_is_newline(prev, e, enc)) {
8893 e = prev;
8894 prev = rb_enc_prev_char(p, e, e, enc);
8895 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8896 e = prev;
8897 }
8898 return e;
8899}
8900
8901static VALUE
8902get_rs(void)
8903{
8904 VALUE rs = rb_rs;
8905 if (!NIL_P(rs) &&
8906 (!RB_TYPE_P(rs, T_STRING) ||
8907 RSTRING_LEN(rs) != 1 ||
8908 RSTRING_PTR(rs)[0] != '\n')) {
8909 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8910 }
8911 return rs;
8912}
8913
8914#define rb_rs get_rs()
8915
8916static VALUE
8917rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8918{
8919 rb_encoding *enc;
8920 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8921 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8922 long pos, len, rslen;
8923 int rsnewline = 0;
8924
8925 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8926 rs = rb_rs;
8927 if (!NIL_P(opts)) {
8928 static ID keywords[1];
8929 if (!keywords[0]) {
8930 keywords[0] = rb_intern_const("chomp");
8931 }
8932 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8933 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
8934 }
8935
8936 if (NIL_P(rs)) {
8937 if (!ENUM_ELEM(ary, str)) {
8938 return ary;
8939 }
8940 else {
8941 return orig;
8942 }
8943 }
8944
8945 if (!RSTRING_LEN(str)) goto end;
8946 str = rb_str_new_frozen(str);
8947 ptr = subptr = RSTRING_PTR(str);
8948 pend = RSTRING_END(str);
8949 len = RSTRING_LEN(str);
8950 StringValue(rs);
8951 rslen = RSTRING_LEN(rs);
8952
8953 if (rs == rb_default_rs)
8954 enc = rb_enc_get(str);
8955 else
8956 enc = rb_enc_check(str, rs);
8957
8958 if (rslen == 0) {
8959 /* paragraph mode */
8960 int n;
8961 const char *eol = NULL;
8962 subend = subptr;
8963 while (subend < pend) {
8964 long chomp_rslen = 0;
8965 do {
8966 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
8967 n = 0;
8968 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8969 if (rb_enc_is_newline(subend + n, pend, enc)) {
8970 if (eol == subend) break;
8971 subend += rslen;
8972 if (subptr) {
8973 eol = subend;
8974 chomp_rslen = -rslen;
8975 }
8976 }
8977 else {
8978 if (!subptr) subptr = subend;
8979 subend += rslen;
8980 }
8981 rslen = 0;
8982 } while (subend < pend);
8983 if (!subptr) break;
8984 if (rslen == 0) chomp_rslen = 0;
8985 line = rb_str_subseq(str, subptr - ptr,
8986 subend - subptr + (chomp ? chomp_rslen : rslen));
8987 if (ENUM_ELEM(ary, line)) {
8988 str_mod_check(str, ptr, len);
8989 }
8990 subptr = eol = NULL;
8991 }
8992 goto end;
8993 }
8994 else {
8995 rsptr = RSTRING_PTR(rs);
8996 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
8997 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
8998 rsnewline = 1;
8999 }
9000 }
9001
9002 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9003 rs = rb_str_new(rsptr, rslen);
9004 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9005 rsptr = RSTRING_PTR(rs);
9006 rslen = RSTRING_LEN(rs);
9007 }
9008
9009 while (subptr < pend) {
9010 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9011 if (pos < 0) break;
9012 hit = subptr + pos;
9013 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9014 if (hit != adjusted) {
9015 subptr = adjusted;
9016 continue;
9017 }
9018 subend = hit += rslen;
9019 if (chomp) {
9020 if (rsnewline) {
9021 subend = chomp_newline(subptr, subend, enc);
9022 }
9023 else {
9024 subend -= rslen;
9025 }
9026 }
9027 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9028 if (ENUM_ELEM(ary, line)) {
9029 str_mod_check(str, ptr, len);
9030 }
9031 subptr = hit;
9032 }
9033
9034 if (subptr != pend) {
9035 if (chomp) {
9036 if (rsnewline) {
9037 pend = chomp_newline(subptr, pend, enc);
9038 }
9039 else if (pend - subptr >= rslen &&
9040 memcmp(pend - rslen, rsptr, rslen) == 0) {
9041 pend -= rslen;
9042 }
9043 }
9044 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9045 ENUM_ELEM(ary, line);
9046 RB_GC_GUARD(str);
9047 }
9048
9049 end:
9050 if (ary)
9051 return ary;
9052 else
9053 return orig;
9054}
9055
9056/*
9057 * call-seq:
9058 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9059 * each_line(line_sep = $/, chomp: false) -> enumerator
9060 *
9061 * :include: doc/string/each_line.rdoc
9062 *
9063 */
9064
9065static VALUE
9066rb_str_each_line(int argc, VALUE *argv, VALUE str)
9067{
9068 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9069 return rb_str_enumerate_lines(argc, argv, str, 0);
9070}
9071
9072/*
9073 * call-seq:
9074 * lines(line_sep = $/, chomp: false) -> array_of_strings
9075 *
9076 * Forms substrings ("lines") of +self+ according to the given arguments
9077 * (see String#each_line for details); returns the lines in an array.
9078 *
9079 */
9080
9081static VALUE
9082rb_str_lines(int argc, VALUE *argv, VALUE str)
9083{
9084 VALUE ary = WANTARRAY("lines", 0);
9085 return rb_str_enumerate_lines(argc, argv, str, ary);
9086}
9087
9088static VALUE
9089rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9090{
9091 return LONG2FIX(RSTRING_LEN(str));
9092}
9093
9094static VALUE
9095rb_str_enumerate_bytes(VALUE str, VALUE ary)
9096{
9097 long i;
9098
9099 for (i=0; i<RSTRING_LEN(str); i++) {
9100 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9101 }
9102 if (ary)
9103 return ary;
9104 else
9105 return str;
9106}
9107
9108/*
9109 * call-seq:
9110 * each_byte {|byte| ... } -> self
9111 * each_byte -> enumerator
9112 *
9113 * :include: doc/string/each_byte.rdoc
9114 *
9115 */
9116
9117static VALUE
9118rb_str_each_byte(VALUE str)
9119{
9120 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9121 return rb_str_enumerate_bytes(str, 0);
9122}
9123
9124/*
9125 * call-seq:
9126 * bytes -> array_of_bytes
9127 *
9128 * :include: doc/string/bytes.rdoc
9129 *
9130 */
9131
9132static VALUE
9133rb_str_bytes(VALUE str)
9134{
9135 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9136 return rb_str_enumerate_bytes(str, ary);
9137}
9138
9139static VALUE
9140rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9141{
9142 return rb_str_length(str);
9143}
9144
9145static VALUE
9146rb_str_enumerate_chars(VALUE str, VALUE ary)
9147{
9148 VALUE orig = str;
9149 long i, len, n;
9150 const char *ptr;
9151 rb_encoding *enc;
9152
9153 str = rb_str_new_frozen(str);
9154 ptr = RSTRING_PTR(str);
9155 len = RSTRING_LEN(str);
9156 enc = rb_enc_get(str);
9157
9159 for (i = 0; i < len; i += n) {
9160 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9161 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9162 }
9163 }
9164 else {
9165 for (i = 0; i < len; i += n) {
9166 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9167 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9168 }
9169 }
9170 RB_GC_GUARD(str);
9171 if (ary)
9172 return ary;
9173 else
9174 return orig;
9175}
9176
9177/*
9178 * call-seq:
9179 * each_char {|c| ... } -> self
9180 * each_char -> enumerator
9181 *
9182 * :include: doc/string/each_char.rdoc
9183 *
9184 */
9185
9186static VALUE
9187rb_str_each_char(VALUE str)
9188{
9189 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9190 return rb_str_enumerate_chars(str, 0);
9191}
9192
9193/*
9194 * call-seq:
9195 * chars -> array_of_characters
9196 *
9197 * :include: doc/string/chars.rdoc
9198 *
9199 */
9200
9201static VALUE
9202rb_str_chars(VALUE str)
9203{
9204 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9205 return rb_str_enumerate_chars(str, ary);
9206}
9207
9208static VALUE
9209rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9210{
9211 VALUE orig = str;
9212 int n;
9213 unsigned int c;
9214 const char *ptr, *end;
9215 rb_encoding *enc;
9216
9217 if (single_byte_optimizable(str))
9218 return rb_str_enumerate_bytes(str, ary);
9219
9220 str = rb_str_new_frozen(str);
9221 ptr = RSTRING_PTR(str);
9222 end = RSTRING_END(str);
9223 enc = STR_ENC_GET(str);
9224
9225 while (ptr < end) {
9226 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9227 ENUM_ELEM(ary, UINT2NUM(c));
9228 ptr += n;
9229 }
9230 RB_GC_GUARD(str);
9231 if (ary)
9232 return ary;
9233 else
9234 return orig;
9235}
9236
9237/*
9238 * call-seq:
9239 * each_codepoint {|integer| ... } -> self
9240 * each_codepoint -> enumerator
9241 *
9242 * :include: doc/string/each_codepoint.rdoc
9243 *
9244 */
9245
9246static VALUE
9247rb_str_each_codepoint(VALUE str)
9248{
9249 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9250 return rb_str_enumerate_codepoints(str, 0);
9251}
9252
9253/*
9254 * call-seq:
9255 * codepoints -> array_of_integers
9256 *
9257 * :include: doc/string/codepoints.rdoc
9258 *
9259 */
9260
9261static VALUE
9262rb_str_codepoints(VALUE str)
9263{
9264 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9265 return rb_str_enumerate_codepoints(str, ary);
9266}
9267
9268static regex_t *
9269get_reg_grapheme_cluster(rb_encoding *enc)
9270{
9271 int encidx = rb_enc_to_index(enc);
9272 regex_t *reg_grapheme_cluster = NULL;
9273 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9274
9275 /* synchronize */
9276 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9277 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9278 }
9279 if (!reg_grapheme_cluster) {
9280 const OnigUChar source_ascii[] = "\\X";
9281 OnigErrorInfo einfo;
9282 const OnigUChar *source = source_ascii;
9283 size_t source_len = sizeof(source_ascii) - 1;
9284 switch (encidx) {
9285#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9286#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9287#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9288#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9289#define CASE_UTF(e) \
9290 case ENCINDEX_UTF_##e: { \
9291 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9292 source = source_UTF_##e; \
9293 source_len = sizeof(source_UTF_##e); \
9294 break; \
9295 }
9296 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9297#undef CASE_UTF
9298#undef CHARS_16BE
9299#undef CHARS_16LE
9300#undef CHARS_32BE
9301#undef CHARS_32LE
9302 }
9303 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9304 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9305 if (r) {
9306 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9307 onig_error_code_to_str(message, r, &einfo);
9308 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9309 }
9310 if (encidx == rb_utf8_encindex()) {
9311 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9312 }
9313 }
9314 return reg_grapheme_cluster;
9315}
9316
9317static VALUE
9318rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9319{
9320 size_t grapheme_cluster_count = 0;
9321 regex_t *reg_grapheme_cluster = NULL;
9322 rb_encoding *enc = get_encoding(str);
9323 const char *ptr, *end;
9324
9325 if (!rb_enc_unicode_p(enc)) {
9326 return rb_str_length(str);
9327 }
9328
9329 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9330 ptr = RSTRING_PTR(str);
9331 end = RSTRING_END(str);
9332
9333 while (ptr < end) {
9334 OnigPosition len = onig_match(reg_grapheme_cluster,
9335 (const OnigUChar *)ptr, (const OnigUChar *)end,
9336 (const OnigUChar *)ptr, NULL, 0);
9337 if (len <= 0) break;
9338 grapheme_cluster_count++;
9339 ptr += len;
9340 }
9341
9342 return SIZET2NUM(grapheme_cluster_count);
9343}
9344
9345static VALUE
9346rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9347{
9348 VALUE orig = str;
9349 regex_t *reg_grapheme_cluster = NULL;
9350 rb_encoding *enc = get_encoding(str);
9351 const char *ptr0, *ptr, *end;
9352
9353 if (!rb_enc_unicode_p(enc)) {
9354 return rb_str_enumerate_chars(str, ary);
9355 }
9356
9357 if (!ary) str = rb_str_new_frozen(str);
9358 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9359 ptr0 = ptr = RSTRING_PTR(str);
9360 end = RSTRING_END(str);
9361
9362 while (ptr < end) {
9363 OnigPosition len = onig_match(reg_grapheme_cluster,
9364 (const OnigUChar *)ptr, (const OnigUChar *)end,
9365 (const OnigUChar *)ptr, NULL, 0);
9366 if (len <= 0) break;
9367 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9368 ptr += len;
9369 }
9370 RB_GC_GUARD(str);
9371 if (ary)
9372 return ary;
9373 else
9374 return orig;
9375}
9376
9377/*
9378 * call-seq:
9379 * each_grapheme_cluster {|gc| ... } -> self
9380 * each_grapheme_cluster -> enumerator
9381 *
9382 * :include: doc/string/each_grapheme_cluster.rdoc
9383 *
9384 */
9385
9386static VALUE
9387rb_str_each_grapheme_cluster(VALUE str)
9388{
9389 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9390 return rb_str_enumerate_grapheme_clusters(str, 0);
9391}
9392
9393/*
9394 * call-seq:
9395 * grapheme_clusters -> array_of_grapheme_clusters
9396 *
9397 * :include: doc/string/grapheme_clusters.rdoc
9398 *
9399 */
9400
9401static VALUE
9402rb_str_grapheme_clusters(VALUE str)
9403{
9404 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9405 return rb_str_enumerate_grapheme_clusters(str, ary);
9406}
9407
9408static long
9409chopped_length(VALUE str)
9410{
9411 rb_encoding *enc = STR_ENC_GET(str);
9412 const char *p, *p2, *beg, *end;
9413
9414 beg = RSTRING_PTR(str);
9415 end = beg + RSTRING_LEN(str);
9416 if (beg >= end) return 0;
9417 p = rb_enc_prev_char(beg, end, end, enc);
9418 if (!p) return 0;
9419 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9420 p2 = rb_enc_prev_char(beg, p, end, enc);
9421 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9422 }
9423 return p - beg;
9424}
9425
9426/*
9427 * call-seq:
9428 * chop! -> self or nil
9429 *
9430 * Like String#chop, but modifies +self+ in place;
9431 * returns +nil+ if +self+ is empty, +self+ otherwise.
9432 *
9433 * Related: String#chomp!.
9434 */
9435
9436static VALUE
9437rb_str_chop_bang(VALUE str)
9438{
9439 str_modify_keep_cr(str);
9440 if (RSTRING_LEN(str) > 0) {
9441 long len;
9442 len = chopped_length(str);
9443 STR_SET_LEN(str, len);
9444 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9445 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9447 }
9448 return str;
9449 }
9450 return Qnil;
9451}
9452
9453
9454/*
9455 * call-seq:
9456 * chop -> new_string
9457 *
9458 * :include: doc/string/chop.rdoc
9459 *
9460 */
9461
9462static VALUE
9463rb_str_chop(VALUE str)
9464{
9465 return rb_str_subseq(str, 0, chopped_length(str));
9466}
9467
9468static long
9469smart_chomp(VALUE str, const char *e, const char *p)
9470{
9471 rb_encoding *enc = rb_enc_get(str);
9472 if (rb_enc_mbminlen(enc) > 1) {
9473 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9474 if (rb_enc_is_newline(pp, e, enc)) {
9475 e = pp;
9476 }
9477 pp = e - rb_enc_mbminlen(enc);
9478 if (pp >= p) {
9479 pp = rb_enc_left_char_head(p, pp, e, enc);
9480 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9481 e = pp;
9482 }
9483 }
9484 }
9485 else {
9486 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9487 case '\n':
9488 if (--e > p && *(e-1) == '\r') {
9489 --e;
9490 }
9491 break;
9492 case '\r':
9493 --e;
9494 break;
9495 }
9496 }
9497 return e - p;
9498}
9499
9500static long
9501chompped_length(VALUE str, VALUE rs)
9502{
9503 rb_encoding *enc;
9504 int newline;
9505 char *pp, *e, *rsptr;
9506 long rslen;
9507 char *const p = RSTRING_PTR(str);
9508 long len = RSTRING_LEN(str);
9509
9510 if (len == 0) return 0;
9511 e = p + len;
9512 if (rs == rb_default_rs) {
9513 return smart_chomp(str, e, p);
9514 }
9515
9516 enc = rb_enc_get(str);
9517 RSTRING_GETMEM(rs, rsptr, rslen);
9518 if (rslen == 0) {
9519 if (rb_enc_mbminlen(enc) > 1) {
9520 while (e > p) {
9521 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9522 if (!rb_enc_is_newline(pp, e, enc)) break;
9523 e = pp;
9524 pp -= rb_enc_mbminlen(enc);
9525 if (pp >= p) {
9526 pp = rb_enc_left_char_head(p, pp, e, enc);
9527 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9528 e = pp;
9529 }
9530 }
9531 }
9532 }
9533 else {
9534 while (e > p && *(e-1) == '\n') {
9535 --e;
9536 if (e > p && *(e-1) == '\r')
9537 --e;
9538 }
9539 }
9540 return e - p;
9541 }
9542 if (rslen > len) return len;
9543
9544 enc = rb_enc_get(rs);
9545 newline = rsptr[rslen-1];
9546 if (rslen == rb_enc_mbminlen(enc)) {
9547 if (rslen == 1) {
9548 if (newline == '\n')
9549 return smart_chomp(str, e, p);
9550 }
9551 else {
9552 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9553 return smart_chomp(str, e, p);
9554 }
9555 }
9556
9557 enc = rb_enc_check(str, rs);
9558 if (is_broken_string(rs)) {
9559 return len;
9560 }
9561 pp = e - rslen;
9562 if (p[len-1] == newline &&
9563 (rslen <= 1 ||
9564 memcmp(rsptr, pp, rslen) == 0)) {
9565 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9566 return len - rslen;
9567 RB_GC_GUARD(rs);
9568 }
9569 return len;
9570}
9571
9577static VALUE
9578chomp_rs(int argc, const VALUE *argv)
9579{
9580 rb_check_arity(argc, 0, 1);
9581 if (argc > 0) {
9582 VALUE rs = argv[0];
9583 if (!NIL_P(rs)) StringValue(rs);
9584 return rs;
9585 }
9586 else {
9587 return rb_rs;
9588 }
9589}
9590
9591VALUE
9592rb_str_chomp_string(VALUE str, VALUE rs)
9593{
9594 long olen = RSTRING_LEN(str);
9595 long len = chompped_length(str, rs);
9596 if (len >= olen) return Qnil;
9597 str_modify_keep_cr(str);
9598 STR_SET_LEN(str, len);
9599 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9600 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9602 }
9603 return str;
9604}
9605
9606/*
9607 * call-seq:
9608 * chomp!(line_sep = $/) -> self or nil
9609 *
9610 * Like String#chomp, but modifies +self+ in place;
9611 * returns +nil+ if no modification made, +self+ otherwise.
9612 *
9613 */
9614
9615static VALUE
9616rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9617{
9618 VALUE rs;
9619 str_modifiable(str);
9620 if (RSTRING_LEN(str) == 0) return Qnil;
9621 rs = chomp_rs(argc, argv);
9622 if (NIL_P(rs)) return Qnil;
9623 return rb_str_chomp_string(str, rs);
9624}
9625
9626
9627/*
9628 * call-seq:
9629 * chomp(line_sep = $/) -> new_string
9630 *
9631 * :include: doc/string/chomp.rdoc
9632 *
9633 */
9634
9635static VALUE
9636rb_str_chomp(int argc, VALUE *argv, VALUE str)
9637{
9638 VALUE rs = chomp_rs(argc, argv);
9639 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9640 return rb_str_subseq(str, 0, chompped_length(str, rs));
9641}
9642
9643static long
9644lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9645{
9646 const char *const start = s;
9647
9648 if (!s || s >= e) return 0;
9649
9650 /* remove spaces at head */
9651 if (single_byte_optimizable(str)) {
9652 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9653 }
9654 else {
9655 while (s < e) {
9656 int n;
9657 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9658
9659 if (cc && !rb_isspace(cc)) break;
9660 s += n;
9661 }
9662 }
9663 return s - start;
9664}
9665
9666/*
9667 * call-seq:
9668 * lstrip! -> self or nil
9669 *
9670 * Like String#lstrip, except that any modifications are made in +self+;
9671 * returns +self+ if any modifications are made, +nil+ otherwise.
9672 *
9673 * Related: String#rstrip!, String#strip!.
9674 */
9675
9676static VALUE
9677rb_str_lstrip_bang(VALUE str)
9678{
9679 rb_encoding *enc;
9680 char *start, *s;
9681 long olen, loffset;
9682
9683 str_modify_keep_cr(str);
9684 enc = STR_ENC_GET(str);
9685 RSTRING_GETMEM(str, start, olen);
9686 loffset = lstrip_offset(str, start, start+olen, enc);
9687 if (loffset > 0) {
9688 long len = olen-loffset;
9689 s = start + loffset;
9690 memmove(start, s, len);
9691 STR_SET_LEN(str, len);
9692 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9693 return str;
9694 }
9695 return Qnil;
9696}
9697
9698
9699/*
9700 * call-seq:
9701 * lstrip -> new_string
9702 *
9703 * Returns a copy of +self+ with leading whitespace removed;
9704 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9705 *
9706 * whitespace = "\x00\t\n\v\f\r "
9707 * s = whitespace + 'abc' + whitespace
9708 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9709 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9710 *
9711 * Related: String#rstrip, String#strip.
9712 */
9713
9714static VALUE
9715rb_str_lstrip(VALUE str)
9716{
9717 char *start;
9718 long len, loffset;
9719 RSTRING_GETMEM(str, start, len);
9720 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9721 if (loffset <= 0) return str_duplicate(rb_cString, str);
9722 return rb_str_subseq(str, loffset, len - loffset);
9723}
9724
9725static long
9726rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9727{
9728 const char *t;
9729
9730 rb_str_check_dummy_enc(enc);
9732 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9733 }
9734 if (!s || s >= e) return 0;
9735 t = e;
9736
9737 /* remove trailing spaces or '\0's */
9738 if (single_byte_optimizable(str)) {
9739 unsigned char c;
9740 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9741 }
9742 else {
9743 char *tp;
9744
9745 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9746 unsigned int c = rb_enc_codepoint(tp, e, enc);
9747 if (c && !rb_isspace(c)) break;
9748 t = tp;
9749 }
9750 }
9751 return e - t;
9752}
9753
9754/*
9755 * call-seq:
9756 * rstrip! -> self or nil
9757 *
9758 * Like String#rstrip, except that any modifications are made in +self+;
9759 * returns +self+ if any modifications are made, +nil+ otherwise.
9760 *
9761 * Related: String#lstrip!, String#strip!.
9762 */
9763
9764static VALUE
9765rb_str_rstrip_bang(VALUE str)
9766{
9767 rb_encoding *enc;
9768 char *start;
9769 long olen, roffset;
9770
9771 str_modify_keep_cr(str);
9772 enc = STR_ENC_GET(str);
9773 RSTRING_GETMEM(str, start, olen);
9774 roffset = rstrip_offset(str, start, start+olen, enc);
9775 if (roffset > 0) {
9776 long len = olen - roffset;
9777
9778 STR_SET_LEN(str, len);
9779 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9780 return str;
9781 }
9782 return Qnil;
9783}
9784
9785
9786/*
9787 * call-seq:
9788 * rstrip -> new_string
9789 *
9790 * Returns a copy of the receiver with trailing whitespace removed;
9791 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9792 *
9793 * whitespace = "\x00\t\n\v\f\r "
9794 * s = whitespace + 'abc' + whitespace
9795 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9796 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9797 *
9798 * Related: String#lstrip, String#strip.
9799 */
9800
9801static VALUE
9802rb_str_rstrip(VALUE str)
9803{
9804 rb_encoding *enc;
9805 char *start;
9806 long olen, roffset;
9807
9808 enc = STR_ENC_GET(str);
9809 RSTRING_GETMEM(str, start, olen);
9810 roffset = rstrip_offset(str, start, start+olen, enc);
9811
9812 if (roffset <= 0) return str_duplicate(rb_cString, str);
9813 return rb_str_subseq(str, 0, olen-roffset);
9814}
9815
9816
9817/*
9818 * call-seq:
9819 * strip! -> self or nil
9820 *
9821 * Like String#strip, except that any modifications are made in +self+;
9822 * returns +self+ if any modifications are made, +nil+ otherwise.
9823 *
9824 * Related: String#lstrip!, String#rstrip!.
9825 */
9826
9827static VALUE
9828rb_str_strip_bang(VALUE str)
9829{
9830 char *start;
9831 long olen, loffset, roffset;
9832 rb_encoding *enc;
9833
9834 str_modify_keep_cr(str);
9835 enc = STR_ENC_GET(str);
9836 RSTRING_GETMEM(str, start, olen);
9837 loffset = lstrip_offset(str, start, start+olen, enc);
9838 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9839
9840 if (loffset > 0 || roffset > 0) {
9841 long len = olen-roffset;
9842 if (loffset > 0) {
9843 len -= loffset;
9844 memmove(start, start + loffset, len);
9845 }
9846 STR_SET_LEN(str, len);
9847 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9848 return str;
9849 }
9850 return Qnil;
9851}
9852
9853
9854/*
9855 * call-seq:
9856 * strip -> new_string
9857 *
9858 * Returns a copy of the receiver with leading and trailing whitespace removed;
9859 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9860 *
9861 * whitespace = "\x00\t\n\v\f\r "
9862 * s = whitespace + 'abc' + whitespace
9863 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9864 * s.strip # => "abc"
9865 *
9866 * Related: String#lstrip, String#rstrip.
9867 */
9868
9869static VALUE
9870rb_str_strip(VALUE str)
9871{
9872 char *start;
9873 long olen, loffset, roffset;
9874 rb_encoding *enc = STR_ENC_GET(str);
9875
9876 RSTRING_GETMEM(str, start, olen);
9877 loffset = lstrip_offset(str, start, start+olen, enc);
9878 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9879
9880 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9881 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9882}
9883
9884static VALUE
9885scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9886{
9887 VALUE result, match;
9888 struct re_registers *regs;
9889 int i;
9890 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9891 if (pos >= 0) {
9892 if (BUILTIN_TYPE(pat) == T_STRING) {
9893 regs = NULL;
9894 end = pos + RSTRING_LEN(pat);
9895 }
9896 else {
9897 match = rb_backref_get();
9898 regs = RMATCH_REGS(match);
9899 pos = BEG(0);
9900 end = END(0);
9901 }
9902 if (pos == end) {
9903 rb_encoding *enc = STR_ENC_GET(str);
9904 /*
9905 * Always consume at least one character of the input string
9906 */
9907 if (RSTRING_LEN(str) > end)
9908 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9909 RSTRING_END(str), enc);
9910 else
9911 *start = end + 1;
9912 }
9913 else {
9914 *start = end;
9915 }
9916 if (!regs || regs->num_regs == 1) {
9917 result = rb_str_subseq(str, pos, end - pos);
9918 return result;
9919 }
9920 result = rb_ary_new2(regs->num_regs);
9921 for (i=1; i < regs->num_regs; i++) {
9922 VALUE s = Qnil;
9923 if (BEG(i) >= 0) {
9924 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9925 }
9926 rb_ary_push(result, s);
9927 }
9928
9929 return result;
9930 }
9931 return Qnil;
9932}
9933
9934
9935/*
9936 * call-seq:
9937 * scan(string_or_regexp) -> array
9938 * scan(string_or_regexp) {|matches| ... } -> self
9939 *
9940 * Matches a pattern against +self+; the pattern is:
9941 *
9942 * - +string_or_regexp+ itself, if it is a Regexp.
9943 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
9944 *
9945 * Iterates through +self+, generating a collection of matching results:
9946 *
9947 * - If the pattern contains no groups, each result is the
9948 * matched string, <code>$&</code>.
9949 * - If the pattern contains groups, each result is an array
9950 * containing one entry per group.
9951 *
9952 * With no block given, returns an array of the results:
9953 *
9954 * s = 'cruel world'
9955 * s.scan(/\w+/) # => ["cruel", "world"]
9956 * s.scan(/.../) # => ["cru", "el ", "wor"]
9957 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
9958 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
9959 *
9960 * With a block given, calls the block with each result; returns +self+:
9961 *
9962 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
9963 * print "\n"
9964 * s.scan(/(.)(.)/) {|x,y| print y, x }
9965 * print "\n"
9966 *
9967 * Output:
9968 *
9969 * <<cruel>> <<world>>
9970 * rceu lowlr
9971 *
9972 */
9973
9974static VALUE
9975rb_str_scan(VALUE str, VALUE pat)
9976{
9977 VALUE result;
9978 long start = 0;
9979 long last = -1, prev = 0;
9980 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
9981
9982 pat = get_pat_quoted(pat, 1);
9983 mustnot_broken(str);
9984 if (!rb_block_given_p()) {
9985 VALUE ary = rb_ary_new();
9986
9987 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
9988 last = prev;
9989 prev = start;
9990 rb_ary_push(ary, result);
9991 }
9992 if (last >= 0) rb_pat_search(pat, str, last, 1);
9993 else rb_backref_set(Qnil);
9994 return ary;
9995 }
9996
9997 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
9998 last = prev;
9999 prev = start;
10000 rb_yield(result);
10001 str_mod_check(str, p, len);
10002 }
10003 if (last >= 0) rb_pat_search(pat, str, last, 1);
10004 return str;
10005}
10006
10007
10008/*
10009 * call-seq:
10010 * hex -> integer
10011 *
10012 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10013 * (with an optional sign and an optional <code>0x</code>) and returns the
10014 * corresponding number;
10015 * returns zero if there is no such leading substring:
10016 *
10017 * '0x0a'.hex # => 10
10018 * '-1234'.hex # => -4660
10019 * '0'.hex # => 0
10020 * 'non-numeric'.hex # => 0
10021 *
10022 * Related: String#oct.
10023 *
10024 */
10025
10026static VALUE
10027rb_str_hex(VALUE str)
10028{
10029 return rb_str_to_inum(str, 16, FALSE);
10030}
10031
10032
10033/*
10034 * call-seq:
10035 * oct -> integer
10036 *
10037 * Interprets the leading substring of +self+ as a string of octal digits
10038 * (with an optional sign) and returns the corresponding number;
10039 * returns zero if there is no such leading substring:
10040 *
10041 * '123'.oct # => 83
10042 * '-377'.oct # => -255
10043 * '0377non-numeric'.oct # => 255
10044 * 'non-numeric'.oct # => 0
10045 *
10046 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10047 * see Kernel#Integer.
10048 *
10049 * Related: String#hex.
10050 *
10051 */
10052
10053static VALUE
10054rb_str_oct(VALUE str)
10055{
10056 return rb_str_to_inum(str, -8, FALSE);
10057}
10058
10059#ifndef HAVE_CRYPT_R
10060# include "ruby/thread_native.h"
10061# include "ruby/atomic.h"
10062
10063static struct {
10065} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10066
10067static void
10068crypt_mutex_initialize(void)
10069{
10070}
10071#endif
10072
10073/*
10074 * call-seq:
10075 * crypt(salt_str) -> new_string
10076 *
10077 * Returns the string generated by calling <code>crypt(3)</code>
10078 * standard library function with <code>str</code> and
10079 * <code>salt_str</code>, in this order, as its arguments. Please do
10080 * not use this method any longer. It is legacy; provided only for
10081 * backward compatibility with ruby scripts in earlier days. It is
10082 * bad to use in contemporary programs for several reasons:
10083 *
10084 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10085 * run. The generated string lacks data portability.
10086 *
10087 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10088 * (i.e. silently ends up in unexpected results).
10089 *
10090 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10091 * thread safe.
10092 *
10093 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10094 * very very weak. According to its manpage, Linux's traditional
10095 * <code>crypt(3)</code> output has only 2**56 variations; too
10096 * easy to brute force today. And this is the default behaviour.
10097 *
10098 * * In order to make things robust some OSes implement so-called
10099 * "modular" usage. To go through, you have to do a complex
10100 * build-up of the <code>salt_str</code> parameter, by hand.
10101 * Failure in generation of a proper salt string tends not to
10102 * yield any errors; typos in parameters are normally not
10103 * detectable.
10104 *
10105 * * For instance, in the following example, the second invocation
10106 * of String#crypt is wrong; it has a typo in "round=" (lacks
10107 * "s"). However the call does not fail and something unexpected
10108 * is generated.
10109 *
10110 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10111 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10112 *
10113 * * Even in the "modular" mode, some hash functions are considered
10114 * archaic and no longer recommended at all; for instance module
10115 * <code>$1$</code> is officially abandoned by its author: see
10116 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10117 * instance module <code>$3$</code> is considered completely
10118 * broken: see the manpage of FreeBSD.
10119 *
10120 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10121 * written above, <code>crypt(3)</code> on Mac OS never fails.
10122 * This means even if you build up a proper salt string it
10123 * generates a traditional DES hash anyways, and there is no way
10124 * for you to be aware of.
10125 *
10126 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10127 *
10128 * If for some reason you cannot migrate to other secure contemporary
10129 * password hashing algorithms, install the string-crypt gem and
10130 * <code>require 'string/crypt'</code> to continue using it.
10131 */
10132
10133static VALUE
10134rb_str_crypt(VALUE str, VALUE salt)
10135{
10136#ifdef HAVE_CRYPT_R
10137 VALUE databuf;
10138 struct crypt_data *data;
10139# define CRYPT_END() ALLOCV_END(databuf)
10140#else
10141 extern char *crypt(const char *, const char *);
10142# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10143#endif
10144 VALUE result;
10145 const char *s, *saltp;
10146 char *res;
10147#ifdef BROKEN_CRYPT
10148 char salt_8bit_clean[3];
10149#endif
10150
10151 StringValue(salt);
10152 mustnot_wchar(str);
10153 mustnot_wchar(salt);
10154 s = StringValueCStr(str);
10155 saltp = RSTRING_PTR(salt);
10156 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10157 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10158 }
10159
10160#ifdef BROKEN_CRYPT
10161 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10162 salt_8bit_clean[0] = saltp[0] & 0x7f;
10163 salt_8bit_clean[1] = saltp[1] & 0x7f;
10164 salt_8bit_clean[2] = '\0';
10165 saltp = salt_8bit_clean;
10166 }
10167#endif
10168#ifdef HAVE_CRYPT_R
10169 data = ALLOCV(databuf, sizeof(struct crypt_data));
10170# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10171 data->initialized = 0;
10172# endif
10173 res = crypt_r(s, saltp, data);
10174#else
10175 crypt_mutex_initialize();
10176 rb_nativethread_lock_lock(&crypt_mutex.lock);
10177 res = crypt(s, saltp);
10178#endif
10179 if (!res) {
10180 int err = errno;
10181 CRYPT_END();
10182 rb_syserr_fail(err, "crypt");
10183 }
10184 result = rb_str_new_cstr(res);
10185 CRYPT_END();
10186 return result;
10187}
10188
10189
10190/*
10191 * call-seq:
10192 * ord -> integer
10193 *
10194 * :include: doc/string/ord.rdoc
10195 *
10196 */
10197
10198static VALUE
10199rb_str_ord(VALUE s)
10200{
10201 unsigned int c;
10202
10203 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10204 return UINT2NUM(c);
10205}
10206/*
10207 * call-seq:
10208 * sum(n = 16) -> integer
10209 *
10210 * :include: doc/string/sum.rdoc
10211 *
10212 */
10213
10214static VALUE
10215rb_str_sum(int argc, VALUE *argv, VALUE str)
10216{
10217 int bits = 16;
10218 char *ptr, *p, *pend;
10219 long len;
10220 VALUE sum = INT2FIX(0);
10221 unsigned long sum0 = 0;
10222
10223 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10224 bits = 0;
10225 }
10226 ptr = p = RSTRING_PTR(str);
10227 len = RSTRING_LEN(str);
10228 pend = p + len;
10229
10230 while (p < pend) {
10231 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10232 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10233 str_mod_check(str, ptr, len);
10234 sum0 = 0;
10235 }
10236 sum0 += (unsigned char)*p;
10237 p++;
10238 }
10239
10240 if (bits == 0) {
10241 if (sum0) {
10242 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10243 }
10244 }
10245 else {
10246 if (sum == INT2FIX(0)) {
10247 if (bits < (int)sizeof(long)*CHAR_BIT) {
10248 sum0 &= (((unsigned long)1)<<bits)-1;
10249 }
10250 sum = LONG2FIX(sum0);
10251 }
10252 else {
10253 VALUE mod;
10254
10255 if (sum0) {
10256 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10257 }
10258
10259 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10260 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10261 sum = rb_funcall(sum, '&', 1, mod);
10262 }
10263 }
10264 return sum;
10265}
10266
10267static VALUE
10268rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10269{
10270 rb_encoding *enc;
10271 VALUE w;
10272 long width, len, flen = 1, fclen = 1;
10273 VALUE res;
10274 char *p;
10275 const char *f = " ";
10276 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10277 VALUE pad;
10278 int singlebyte = 1, cr;
10279 int termlen;
10280
10281 rb_scan_args(argc, argv, "11", &w, &pad);
10282 enc = STR_ENC_GET(str);
10283 termlen = rb_enc_mbminlen(enc);
10284 width = NUM2LONG(w);
10285 if (argc == 2) {
10286 StringValue(pad);
10287 enc = rb_enc_check(str, pad);
10288 f = RSTRING_PTR(pad);
10289 flen = RSTRING_LEN(pad);
10290 fclen = str_strlen(pad, enc); /* rb_enc_check */
10291 singlebyte = single_byte_optimizable(pad);
10292 if (flen == 0 || fclen == 0) {
10293 rb_raise(rb_eArgError, "zero width padding");
10294 }
10295 }
10296 len = str_strlen(str, enc); /* rb_enc_check */
10297 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10298 n = width - len;
10299 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10300 rlen = n - llen;
10301 cr = ENC_CODERANGE(str);
10302 if (flen > 1) {
10303 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10304 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10305 }
10306 size = RSTRING_LEN(str);
10307 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10308 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10309 (len += llen2 + rlen2) >= LONG_MAX - size) {
10310 rb_raise(rb_eArgError, "argument too big");
10311 }
10312 len += size;
10313 res = str_new0(rb_cString, 0, len, termlen);
10314 p = RSTRING_PTR(res);
10315 if (flen <= 1) {
10316 memset(p, *f, llen);
10317 p += llen;
10318 }
10319 else {
10320 while (llen >= fclen) {
10321 memcpy(p,f,flen);
10322 p += flen;
10323 llen -= fclen;
10324 }
10325 if (llen > 0) {
10326 memcpy(p, f, llen2);
10327 p += llen2;
10328 }
10329 }
10330 memcpy(p, RSTRING_PTR(str), size);
10331 p += size;
10332 if (flen <= 1) {
10333 memset(p, *f, rlen);
10334 p += rlen;
10335 }
10336 else {
10337 while (rlen >= fclen) {
10338 memcpy(p,f,flen);
10339 p += flen;
10340 rlen -= fclen;
10341 }
10342 if (rlen > 0) {
10343 memcpy(p, f, rlen2);
10344 p += rlen2;
10345 }
10346 }
10347 TERM_FILL(p, termlen);
10348 STR_SET_LEN(res, p-RSTRING_PTR(res));
10349 rb_enc_associate(res, enc);
10350 if (argc == 2)
10351 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10352 if (cr != ENC_CODERANGE_BROKEN)
10353 ENC_CODERANGE_SET(res, cr);
10354
10355 RB_GC_GUARD(pad);
10356 return res;
10357}
10358
10359
10360/*
10361 * call-seq:
10362 * ljust(size, pad_string = ' ') -> new_string
10363 *
10364 * :include: doc/string/ljust.rdoc
10365 *
10366 * Related: String#rjust, String#center.
10367 *
10368 */
10369
10370static VALUE
10371rb_str_ljust(int argc, VALUE *argv, VALUE str)
10372{
10373 return rb_str_justify(argc, argv, str, 'l');
10374}
10375
10376/*
10377 * call-seq:
10378 * rjust(size, pad_string = ' ') -> new_string
10379 *
10380 * :include: doc/string/rjust.rdoc
10381 *
10382 * Related: String#ljust, String#center.
10383 *
10384 */
10385
10386static VALUE
10387rb_str_rjust(int argc, VALUE *argv, VALUE str)
10388{
10389 return rb_str_justify(argc, argv, str, 'r');
10390}
10391
10392
10393/*
10394 * call-seq:
10395 * center(size, pad_string = ' ') -> new_string
10396 *
10397 * :include: doc/string/center.rdoc
10398 *
10399 * Related: String#ljust, String#rjust.
10400 *
10401 */
10402
10403static VALUE
10404rb_str_center(int argc, VALUE *argv, VALUE str)
10405{
10406 return rb_str_justify(argc, argv, str, 'c');
10407}
10408
10409/*
10410 * call-seq:
10411 * partition(string_or_regexp) -> [head, match, tail]
10412 *
10413 * :include: doc/string/partition.rdoc
10414 *
10415 */
10416
10417static VALUE
10418rb_str_partition(VALUE str, VALUE sep)
10419{
10420 long pos;
10421
10422 sep = get_pat_quoted(sep, 0);
10423 if (RB_TYPE_P(sep, T_REGEXP)) {
10424 if (rb_reg_search(sep, str, 0, 0) < 0) {
10425 goto failed;
10426 }
10427 VALUE match = rb_backref_get();
10428 struct re_registers *regs = RMATCH_REGS(match);
10429
10430 pos = BEG(0);
10431 sep = rb_str_subseq(str, pos, END(0) - pos);
10432 }
10433 else {
10434 pos = rb_str_index(str, sep, 0);
10435 if (pos < 0) goto failed;
10436 }
10437 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10438 sep,
10439 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10440 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10441
10442 failed:
10443 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10444}
10445
10446/*
10447 * call-seq:
10448 * rpartition(sep) -> [head, match, tail]
10449 *
10450 * :include: doc/string/rpartition.rdoc
10451 *
10452 */
10453
10454static VALUE
10455rb_str_rpartition(VALUE str, VALUE sep)
10456{
10457 long pos = RSTRING_LEN(str);
10458
10459 sep = get_pat_quoted(sep, 0);
10460 if (RB_TYPE_P(sep, T_REGEXP)) {
10461 if (rb_reg_search(sep, str, pos, 1) < 0) {
10462 goto failed;
10463 }
10464 VALUE match = rb_backref_get();
10465 struct re_registers *regs = RMATCH_REGS(match);
10466
10467 pos = BEG(0);
10468 sep = rb_str_subseq(str, pos, END(0) - pos);
10469 }
10470 else {
10471 pos = rb_str_sublen(str, pos);
10472 pos = rb_str_rindex(str, sep, pos);
10473 if (pos < 0) {
10474 goto failed;
10475 }
10476 pos = rb_str_offset(str, pos);
10477 }
10478
10479 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10480 sep,
10481 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10482 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10483 failed:
10484 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10485}
10486
10487/*
10488 * call-seq:
10489 * start_with?(*string_or_regexp) -> true or false
10490 *
10491 * :include: doc/string/start_with_p.rdoc
10492 *
10493 */
10494
10495static VALUE
10496rb_str_start_with(int argc, VALUE *argv, VALUE str)
10497{
10498 int i;
10499
10500 for (i=0; i<argc; i++) {
10501 VALUE tmp = argv[i];
10502 if (RB_TYPE_P(tmp, T_REGEXP)) {
10503 if (rb_reg_start_with_p(tmp, str))
10504 return Qtrue;
10505 }
10506 else {
10507 StringValue(tmp);
10508 rb_enc_check(str, tmp);
10509 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10510 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10511 return Qtrue;
10512 }
10513 }
10514 return Qfalse;
10515}
10516
10517/*
10518 * call-seq:
10519 * end_with?(*strings) -> true or false
10520 *
10521 * :include: doc/string/end_with_p.rdoc
10522 *
10523 */
10524
10525static VALUE
10526rb_str_end_with(int argc, VALUE *argv, VALUE str)
10527{
10528 int i;
10529 char *p, *s, *e;
10530 rb_encoding *enc;
10531
10532 for (i=0; i<argc; i++) {
10533 VALUE tmp = argv[i];
10534 long slen, tlen;
10535 StringValue(tmp);
10536 enc = rb_enc_check(str, tmp);
10537 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10538 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10539 p = RSTRING_PTR(str);
10540 e = p + slen;
10541 s = e - tlen;
10542 if (rb_enc_left_char_head(p, s, e, enc) != s)
10543 continue;
10544 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10545 return Qtrue;
10546 }
10547 return Qfalse;
10548}
10549
10559static long
10560deleted_prefix_length(VALUE str, VALUE prefix)
10561{
10562 char *strptr, *prefixptr;
10563 long olen, prefixlen;
10564
10565 StringValue(prefix);
10566 if (is_broken_string(prefix)) return 0;
10567 rb_enc_check(str, prefix);
10568
10569 /* return 0 if not start with prefix */
10570 prefixlen = RSTRING_LEN(prefix);
10571 if (prefixlen <= 0) return 0;
10572 olen = RSTRING_LEN(str);
10573 if (olen < prefixlen) return 0;
10574 strptr = RSTRING_PTR(str);
10575 prefixptr = RSTRING_PTR(prefix);
10576 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10577
10578 return prefixlen;
10579}
10580
10581/*
10582 * call-seq:
10583 * delete_prefix!(prefix) -> self or nil
10584 *
10585 * Like String#delete_prefix, except that +self+ is modified in place.
10586 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10587 *
10588 */
10589
10590static VALUE
10591rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10592{
10593 long prefixlen;
10594 str_modify_keep_cr(str);
10595
10596 prefixlen = deleted_prefix_length(str, prefix);
10597 if (prefixlen <= 0) return Qnil;
10598
10599 return rb_str_drop_bytes(str, prefixlen);
10600}
10601
10602/*
10603 * call-seq:
10604 * delete_prefix(prefix) -> new_string
10605 *
10606 * :include: doc/string/delete_prefix.rdoc
10607 *
10608 */
10609
10610static VALUE
10611rb_str_delete_prefix(VALUE str, VALUE prefix)
10612{
10613 long prefixlen;
10614
10615 prefixlen = deleted_prefix_length(str, prefix);
10616 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10617
10618 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10619}
10620
10630static long
10631deleted_suffix_length(VALUE str, VALUE suffix)
10632{
10633 char *strptr, *suffixptr, *s;
10634 long olen, suffixlen;
10635 rb_encoding *enc;
10636
10637 StringValue(suffix);
10638 if (is_broken_string(suffix)) return 0;
10639 enc = rb_enc_check(str, suffix);
10640
10641 /* return 0 if not start with suffix */
10642 suffixlen = RSTRING_LEN(suffix);
10643 if (suffixlen <= 0) return 0;
10644 olen = RSTRING_LEN(str);
10645 if (olen < suffixlen) return 0;
10646 strptr = RSTRING_PTR(str);
10647 suffixptr = RSTRING_PTR(suffix);
10648 s = strptr + olen - suffixlen;
10649 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10650 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10651
10652 return suffixlen;
10653}
10654
10655/*
10656 * call-seq:
10657 * delete_suffix!(suffix) -> self or nil
10658 *
10659 * Like String#delete_suffix, except that +self+ is modified in place.
10660 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10661 *
10662 */
10663
10664static VALUE
10665rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10666{
10667 long olen, suffixlen, len;
10668 str_modifiable(str);
10669
10670 suffixlen = deleted_suffix_length(str, suffix);
10671 if (suffixlen <= 0) return Qnil;
10672
10673 olen = RSTRING_LEN(str);
10674 str_modify_keep_cr(str);
10675 len = olen - suffixlen;
10676 STR_SET_LEN(str, len);
10677 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10678 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10680 }
10681 return str;
10682}
10683
10684/*
10685 * call-seq:
10686 * delete_suffix(suffix) -> new_string
10687 *
10688 * :include: doc/string/delete_suffix.rdoc
10689 *
10690 */
10691
10692static VALUE
10693rb_str_delete_suffix(VALUE str, VALUE suffix)
10694{
10695 long suffixlen;
10696
10697 suffixlen = deleted_suffix_length(str, suffix);
10698 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10699
10700 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10701}
10702
10703void
10704rb_str_setter(VALUE val, ID id, VALUE *var)
10705{
10706 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10707 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10708 }
10709 *var = val;
10710}
10711
10712static void
10713rb_fs_setter(VALUE val, ID id, VALUE *var)
10714{
10715 val = rb_fs_check(val);
10716 if (!val) {
10718 "value of %"PRIsVALUE" must be String or Regexp",
10719 rb_id2str(id));
10720 }
10721 if (!NIL_P(val)) {
10722 rb_warn_deprecated("`$;'", NULL);
10723 }
10724 *var = val;
10725}
10726
10727
10728/*
10729 * call-seq:
10730 * force_encoding(encoding) -> self
10731 *
10732 * :include: doc/string/force_encoding.rdoc
10733 *
10734 */
10735
10736static VALUE
10737rb_str_force_encoding(VALUE str, VALUE enc)
10738{
10739 str_modifiable(str);
10740 rb_enc_associate(str, rb_to_encoding(enc));
10742 return str;
10743}
10744
10745/*
10746 * call-seq:
10747 * b -> string
10748 *
10749 * :include: doc/string/b.rdoc
10750 *
10751 */
10752
10753static VALUE
10754rb_str_b(VALUE str)
10755{
10756 VALUE str2;
10757 if (FL_TEST(str, STR_NOEMBED)) {
10758 str2 = str_alloc_heap(rb_cString);
10759 }
10760 else {
10761 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10762 }
10763 str_replace_shared_without_enc(str2, str);
10764
10765 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10766 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10767 // If we know the receiver's code range then we know the result's code range.
10768 int cr = ENC_CODERANGE(str);
10769 switch (cr) {
10770 case ENC_CODERANGE_7BIT:
10772 break;
10776 break;
10777 default:
10778 ENC_CODERANGE_CLEAR(str2);
10779 break;
10780 }
10781 }
10782
10783 return str2;
10784}
10785
10786/*
10787 * call-seq:
10788 * valid_encoding? -> true or false
10789 *
10790 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10791 *
10792 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10793 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10794 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10795 */
10796
10797static VALUE
10798rb_str_valid_encoding_p(VALUE str)
10799{
10800 int cr = rb_enc_str_coderange(str);
10801
10802 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10803}
10804
10805/*
10806 * call-seq:
10807 * ascii_only? -> true or false
10808 *
10809 * Returns +true+ if +self+ contains only ASCII characters,
10810 * +false+ otherwise:
10811 *
10812 * 'abc'.ascii_only? # => true
10813 * "abc\u{6666}".ascii_only? # => false
10814 *
10815 */
10816
10817static VALUE
10818rb_str_is_ascii_only_p(VALUE str)
10819{
10820 int cr = rb_enc_str_coderange(str);
10821
10822 return RBOOL(cr == ENC_CODERANGE_7BIT);
10823}
10824
10825VALUE
10827{
10828 static const char ellipsis[] = "...";
10829 const long ellipsislen = sizeof(ellipsis) - 1;
10830 rb_encoding *const enc = rb_enc_get(str);
10831 const long blen = RSTRING_LEN(str);
10832 const char *const p = RSTRING_PTR(str), *e = p + blen;
10833 VALUE estr, ret = 0;
10834
10835 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10836 if (len * rb_enc_mbminlen(enc) >= blen ||
10837 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10838 ret = str;
10839 }
10840 else if (len <= ellipsislen ||
10841 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10842 if (rb_enc_asciicompat(enc)) {
10843 ret = rb_str_new(ellipsis, len);
10844 rb_enc_associate(ret, enc);
10845 }
10846 else {
10847 estr = rb_usascii_str_new(ellipsis, len);
10848 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10849 }
10850 }
10851 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10852 rb_str_cat(ret, ellipsis, ellipsislen);
10853 }
10854 else {
10855 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10856 rb_enc_from_encoding(enc), 0, Qnil);
10857 rb_str_append(ret, estr);
10858 }
10859 return ret;
10860}
10861
10862static VALUE
10863str_compat_and_valid(VALUE str, rb_encoding *enc)
10864{
10865 int cr;
10866 str = StringValue(str);
10867 cr = rb_enc_str_coderange(str);
10868 if (cr == ENC_CODERANGE_BROKEN) {
10869 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10870 }
10871 else {
10872 rb_encoding *e = STR_ENC_GET(str);
10873 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10874 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10875 rb_enc_name(enc), rb_enc_name(e));
10876 }
10877 }
10878 return str;
10879}
10880
10881static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10882
10883VALUE
10885{
10886 rb_encoding *enc = STR_ENC_GET(str);
10887 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10888}
10889
10890VALUE
10891rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10892{
10893 int cr = ENC_CODERANGE_UNKNOWN;
10894 if (enc == STR_ENC_GET(str)) {
10895 /* cached coderange makes sense only when enc equals the
10896 * actual encoding of str */
10897 cr = ENC_CODERANGE(str);
10898 }
10899 return enc_str_scrub(enc, str, repl, cr);
10900}
10901
10902static VALUE
10903enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10904{
10905 int encidx;
10906 VALUE buf = Qnil;
10907 const char *rep, *p, *e, *p1, *sp;
10908 long replen = -1;
10909 long slen;
10910
10911 if (rb_block_given_p()) {
10912 if (!NIL_P(repl))
10913 rb_raise(rb_eArgError, "both of block and replacement given");
10914 replen = 0;
10915 }
10916
10917 if (ENC_CODERANGE_CLEAN_P(cr))
10918 return Qnil;
10919
10920 if (!NIL_P(repl)) {
10921 repl = str_compat_and_valid(repl, enc);
10922 }
10923
10924 if (rb_enc_dummy_p(enc)) {
10925 return Qnil;
10926 }
10927 encidx = rb_enc_to_index(enc);
10928
10929#define DEFAULT_REPLACE_CHAR(str) do { \
10930 static const char replace[sizeof(str)-1] = str; \
10931 rep = replace; replen = (int)sizeof(replace); \
10932 } while (0)
10933
10934 slen = RSTRING_LEN(str);
10935 p = RSTRING_PTR(str);
10936 e = RSTRING_END(str);
10937 p1 = p;
10938 sp = p;
10939
10940 if (rb_enc_asciicompat(enc)) {
10941 int rep7bit_p;
10942 if (!replen) {
10943 rep = NULL;
10944 rep7bit_p = FALSE;
10945 }
10946 else if (!NIL_P(repl)) {
10947 rep = RSTRING_PTR(repl);
10948 replen = RSTRING_LEN(repl);
10949 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
10950 }
10951 else if (encidx == rb_utf8_encindex()) {
10952 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
10953 rep7bit_p = FALSE;
10954 }
10955 else {
10956 DEFAULT_REPLACE_CHAR("?");
10957 rep7bit_p = TRUE;
10958 }
10959 cr = ENC_CODERANGE_7BIT;
10960
10961 p = search_nonascii(p, e);
10962 if (!p) {
10963 p = e;
10964 }
10965 while (p < e) {
10966 int ret = rb_enc_precise_mbclen(p, e, enc);
10967 if (MBCLEN_NEEDMORE_P(ret)) {
10968 break;
10969 }
10970 else if (MBCLEN_CHARFOUND_P(ret)) {
10972 p += MBCLEN_CHARFOUND_LEN(ret);
10973 }
10974 else if (MBCLEN_INVALID_P(ret)) {
10975 /*
10976 * p1~p: valid ascii/multibyte chars
10977 * p ~e: invalid bytes + unknown bytes
10978 */
10979 long clen = rb_enc_mbmaxlen(enc);
10980 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
10981 if (p > p1) {
10982 rb_str_buf_cat(buf, p1, p - p1);
10983 }
10984
10985 if (e - p < clen) clen = e - p;
10986 if (clen <= 2) {
10987 clen = 1;
10988 }
10989 else {
10990 const char *q = p;
10991 clen--;
10992 for (; clen > 1; clen--) {
10993 ret = rb_enc_precise_mbclen(q, q + clen, enc);
10994 if (MBCLEN_NEEDMORE_P(ret)) break;
10995 if (MBCLEN_INVALID_P(ret)) continue;
10997 }
10998 }
10999 if (rep) {
11000 rb_str_buf_cat(buf, rep, replen);
11001 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11002 }
11003 else {
11004 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11005 str_mod_check(str, sp, slen);
11006 repl = str_compat_and_valid(repl, enc);
11007 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11010 }
11011 p += clen;
11012 p1 = p;
11013 p = search_nonascii(p, e);
11014 if (!p) {
11015 p = e;
11016 break;
11017 }
11018 }
11019 else {
11021 }
11022 }
11023 if (NIL_P(buf)) {
11024 if (p == e) {
11025 ENC_CODERANGE_SET(str, cr);
11026 return Qnil;
11027 }
11028 buf = rb_str_buf_new(RSTRING_LEN(str));
11029 }
11030 if (p1 < p) {
11031 rb_str_buf_cat(buf, p1, p - p1);
11032 }
11033 if (p < e) {
11034 if (rep) {
11035 rb_str_buf_cat(buf, rep, replen);
11036 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11037 }
11038 else {
11039 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11040 str_mod_check(str, sp, slen);
11041 repl = str_compat_and_valid(repl, enc);
11042 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11045 }
11046 }
11047 }
11048 else {
11049 /* ASCII incompatible */
11050 long mbminlen = rb_enc_mbminlen(enc);
11051 if (!replen) {
11052 rep = NULL;
11053 }
11054 else if (!NIL_P(repl)) {
11055 rep = RSTRING_PTR(repl);
11056 replen = RSTRING_LEN(repl);
11057 }
11058 else if (encidx == ENCINDEX_UTF_16BE) {
11059 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11060 }
11061 else if (encidx == ENCINDEX_UTF_16LE) {
11062 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11063 }
11064 else if (encidx == ENCINDEX_UTF_32BE) {
11065 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11066 }
11067 else if (encidx == ENCINDEX_UTF_32LE) {
11068 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11069 }
11070 else {
11071 DEFAULT_REPLACE_CHAR("?");
11072 }
11073
11074 while (p < e) {
11075 int ret = rb_enc_precise_mbclen(p, e, enc);
11076 if (MBCLEN_NEEDMORE_P(ret)) {
11077 break;
11078 }
11079 else if (MBCLEN_CHARFOUND_P(ret)) {
11080 p += MBCLEN_CHARFOUND_LEN(ret);
11081 }
11082 else if (MBCLEN_INVALID_P(ret)) {
11083 const char *q = p;
11084 long clen = rb_enc_mbmaxlen(enc);
11085 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11086 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11087
11088 if (e - p < clen) clen = e - p;
11089 if (clen <= mbminlen * 2) {
11090 clen = mbminlen;
11091 }
11092 else {
11093 clen -= mbminlen;
11094 for (; clen > mbminlen; clen-=mbminlen) {
11095 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11096 if (MBCLEN_NEEDMORE_P(ret)) break;
11097 if (MBCLEN_INVALID_P(ret)) continue;
11099 }
11100 }
11101 if (rep) {
11102 rb_str_buf_cat(buf, rep, replen);
11103 }
11104 else {
11105 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11106 str_mod_check(str, sp, slen);
11107 repl = str_compat_and_valid(repl, enc);
11108 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11109 }
11110 p += clen;
11111 p1 = p;
11112 }
11113 else {
11115 }
11116 }
11117 if (NIL_P(buf)) {
11118 if (p == e) {
11120 return Qnil;
11121 }
11122 buf = rb_str_buf_new(RSTRING_LEN(str));
11123 }
11124 if (p1 < p) {
11125 rb_str_buf_cat(buf, p1, p - p1);
11126 }
11127 if (p < e) {
11128 if (rep) {
11129 rb_str_buf_cat(buf, rep, replen);
11130 }
11131 else {
11132 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11133 str_mod_check(str, sp, slen);
11134 repl = str_compat_and_valid(repl, enc);
11135 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11136 }
11137 }
11139 }
11140 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11141 return buf;
11142}
11143
11144/*
11145 * call-seq:
11146 * scrub(replacement_string = default_replacement) -> new_string
11147 * scrub{|bytes| ... } -> new_string
11148 *
11149 * :include: doc/string/scrub.rdoc
11150 *
11151 */
11152static VALUE
11153str_scrub(int argc, VALUE *argv, VALUE str)
11154{
11155 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11156 VALUE new = rb_str_scrub(str, repl);
11157 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11158}
11159
11160/*
11161 * call-seq:
11162 * scrub! -> self
11163 * scrub!(replacement_string = default_replacement) -> self
11164 * scrub!{|bytes| ... } -> self
11165 *
11166 * Like String#scrub, except that any replacements are made in +self+.
11167 *
11168 */
11169static VALUE
11170str_scrub_bang(int argc, VALUE *argv, VALUE str)
11171{
11172 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11173 VALUE new = rb_str_scrub(str, repl);
11174 if (!NIL_P(new)) rb_str_replace(str, new);
11175 return str;
11176}
11177
/* Method IDs passed to unicode_normalize_common(); assigned in Init_String(). */
11178static ID id_normalize;
11179static ID id_normalized_p;
/* The UnicodeNormalize module object; defined in Init_String(). */
11180static VALUE mUnicodeNormalize;
11181
11182static VALUE
11183unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11184{
11185 static int UnicodeNormalizeRequired = 0;
11186 VALUE argv2[2];
11187
11188 if (!UnicodeNormalizeRequired) {
11189 rb_require("unicode_normalize/normalize.rb");
11190 UnicodeNormalizeRequired = 1;
11191 }
11192 argv2[0] = str;
11193 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11194 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11195}
11196
11197/*
11198 * call-seq:
11199 * unicode_normalize(form = :nfc) -> string
11200 *
11201 * Returns a copy of +self+ with
11202 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11203 *
11204 * Argument +form+ must be one of the following symbols
11205 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11206 *
11207 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11208 * - +:nfd+: Canonical decomposition.
11209 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11210 * - +:nfkd+: Compatibility decomposition.
11211 *
11212 * The encoding of +self+ must be one of:
11213 *
11214 * - Encoding::UTF_8
11215 * - Encoding::UTF_16BE
11216 * - Encoding::UTF_16LE
11217 * - Encoding::UTF_32BE
11218 * - Encoding::UTF_32LE
11219 * - Encoding::GB18030
11220 * - Encoding::UCS_2BE
11221 * - Encoding::UCS_4BE
11222 *
11223 * Examples:
11224 *
11225 * "a\u0300".unicode_normalize # => "\u00E0"
11226 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
11227 *
11228 * Related: String#unicode_normalize!, String#unicode_normalized?.
11229 */
11230static VALUE
11231rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11232{
11233 return unicode_normalize_common(argc, argv, str, id_normalize);
11234}
11235
11236/*
11237 * call-seq:
11238 * unicode_normalize!(form = :nfc) -> self
11239 *
11240 * Like String#unicode_normalize, except that the normalization
11241 * is performed on +self+.
11242 *
11243 * Related: String#unicode_normalized?.
11244 *
11245 */
11246static VALUE
11247rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11248{
11249 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11250}
11251
11252/* call-seq:
11253 * unicode_normalized?(form = :nfc) -> true or false
11254 *
11255 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11256 * +false+ otherwise.
11257 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11258 *
11259 * Examples:
11260 *
11261 * "a\u0300".unicode_normalized? # => false
11262 * "a\u0300".unicode_normalized?(:nfd) # => true
11263 * "\u00E0".unicode_normalized? # => true
11264 * "\u00E0".unicode_normalized?(:nfd) # => false
11265 *
11266 *
11267 * Raises an exception if +self+ is not in a Unicode encoding:
11268 *
11269 * s = "\xE0".force_encoding('ISO-8859-1')
11270 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11271 *
11272 * Related: String#unicode_normalize, String#unicode_normalize!.
11273 *
11274 */
11275static VALUE
11276rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11277{
11278 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11279}
11280
11281/**********************************************************************
11282 * Document-class: Symbol
11283 *
11284 * Symbol objects represent named identifiers inside the Ruby interpreter.
11285 *
11286 * You can create a \Symbol object explicitly with:
11287 *
11288 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11289 *
11290 * The same Symbol object will be
11291 * created for a given name or string for the duration of a program's
11292 * execution, regardless of the context or meaning of that name. Thus
11293 * if <code>Fred</code> is a constant in one context, a method in
11294 * another, and a class in a third, the Symbol <code>:Fred</code>
11295 * will be the same object in all three contexts.
11296 *
11297 * module One
11298 * class Fred
11299 * end
11300 * $f1 = :Fred
11301 * end
11302 * module Two
11303 * Fred = 1
11304 * $f2 = :Fred
11305 * end
11306 * def Fred()
11307 * end
11308 * $f3 = :Fred
11309 * $f1.object_id #=> 2514190
11310 * $f2.object_id #=> 2514190
11311 * $f3.object_id #=> 2514190
11312 *
11313 * Constant, method, and variable names are returned as symbols:
11314 *
11315 * module One
11316 * Two = 2
11317 * def three; 3 end
11318 * @four = 4
11319 * @@five = 5
11320 * $six = 6
11321 * end
11322 * seven = 7
11323 *
11324 * One.constants
11325 * # => [:Two]
11326 * One.instance_methods(true)
11327 * # => [:three]
11328 * One.instance_variables
11329 * # => [:@four]
11330 * One.class_variables
11331 * # => [:@@five]
11332 * global_variables.grep(/six/)
11333 * # => [:$six]
11334 * local_variables
11335 * # => [:seven]
11336 *
11337 * Symbol objects are different from String objects in that
11338 * Symbol objects represent identifiers, while String objects
11339 * represent text or data.
11340 *
11341 * == What's Here
11342 *
11343 * First, what's elsewhere. \Class \Symbol:
11344 *
11345 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11346 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11347 *
11348 * Here, class \Symbol provides methods that are useful for:
11349 *
11350 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11351 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11352 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11353 *
11354 * === Methods for Querying
11355 *
11356 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11357 * - #=~: Returns the index of the first substring in symbol that matches a
11358 * given Regexp or other object; returns +nil+ if no match is found.
11359 * - #[], #slice : Returns a substring of symbol
11360 * determined by a given index, start/length, or range, or string.
11361 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11362 * - #encoding: Returns the Encoding object that represents the encoding
11363 * of symbol.
11364 * - #end_with?: Returns +true+ if symbol ends with
11365 * any of the given strings.
11366 * - #match: Returns a MatchData object if symbol
11367 * matches a given Regexp; +nil+ otherwise.
11368 * - #match?: Returns +true+ if symbol
11369 * matches a given Regexp; +false+ otherwise.
11370 * - #length, #size: Returns the number of characters in symbol.
11371 * - #start_with?: Returns +true+ if symbol starts with
11372 * any of the given strings.
11373 *
11374 * === Methods for Comparing
11375 *
11376 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11377 * or larger than symbol.
11378 * - #==, #===: Returns +true+ if a given symbol has the same content and
11379 * encoding.
11380 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11381 * symbol is smaller than, equal to, or larger than symbol.
11382 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11383 * after Unicode case folding; +false+ otherwise.
11384 *
11385 * === Methods for Converting
11386 *
11387 * - #capitalize: Returns symbol with the first character upcased
11388 * and all other characters downcased.
11389 * - #downcase: Returns symbol with all characters downcased.
11390 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11391 * - #name: Returns the frozen string corresponding to symbol.
11392 * - #succ, #next: Returns the symbol that is the successor to symbol.
11393 * - #swapcase: Returns symbol with all upcase characters downcased
11394 * and all downcase characters upcased.
11395 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11396 * - #to_s, #id2name: Returns the string corresponding to +self+.
11397 * - #to_sym, #intern: Returns +self+.
11398 * - #upcase: Returns symbol with all characters upcased.
11399 *
11400 */
11401
11402
11403/*
11404 * call-seq:
11405 * symbol == object -> true or false
11406 *
11407 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11408 *
11409 * Symbol#=== is an alias for Symbol#==.
11410 *
11411 */
11412
11413#define sym_equal rb_obj_equal /* registered for Symbol#== and Symbol#=== in Init_String() */
11414
11415static int
11416sym_printable(const char *s, const char *send, rb_encoding *enc)
11417{
11418 while (s < send) {
11419 int n;
11420 int c = rb_enc_precise_mbclen(s, send, enc);
11421
11422 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11423 n = MBCLEN_CHARFOUND_LEN(c);
11424 c = rb_enc_mbc_to_codepoint(s, send, enc);
11425 if (!rb_enc_isprint(c, enc)) return FALSE;
11426 s += n;
11427 }
11428 return TRUE;
11429}
11430
11431int
11432rb_str_symname_p(VALUE sym)
11433{
11434 rb_encoding *enc;
11435 const char *ptr;
11436 long len;
11437 rb_encoding *resenc = rb_default_internal_encoding();
11438
11439 if (resenc == NULL) resenc = rb_default_external_encoding();
11440 enc = STR_ENC_GET(sym);
11441 ptr = RSTRING_PTR(sym);
11442 len = RSTRING_LEN(sym);
11443 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11444 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11445 return FALSE;
11446 }
11447 return TRUE;
11448}
11449
11450VALUE
11451rb_str_quote_unprintable(VALUE str)
11452{
11453 rb_encoding *enc;
11454 const char *ptr;
11455 long len;
11456 rb_encoding *resenc;
11457
11458 Check_Type(str, T_STRING);
11459 resenc = rb_default_internal_encoding();
11460 if (resenc == NULL) resenc = rb_default_external_encoding();
11461 enc = STR_ENC_GET(str);
11462 ptr = RSTRING_PTR(str);
11463 len = RSTRING_LEN(str);
11464 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11465 !sym_printable(ptr, ptr + len, enc)) {
11466 return rb_str_escape(str);
11467 }
11468 return str;
11469}
11470
11471MJIT_FUNC_EXPORTED VALUE
11472rb_id_quote_unprintable(ID id)
11473{
11474 VALUE str = rb_id2str(id);
11475 if (!rb_str_symname_p(str)) {
11476 return rb_str_escape(str);
11477 }
11478 return str;
11479}
11480
11481/*
11482 * call-seq:
11483 * inspect -> string
11484 *
11485 * Returns a string representation of +self+ (including the leading colon):
11486 *
11487 * :foo.inspect # => ":foo"
11488 *
11489 * Related: Symbol#to_s, Symbol#name.
11490 *
11491 */
11492
11493static VALUE
11494sym_inspect(VALUE sym)
11495{
11496 VALUE str = rb_sym2str(sym);
11497 const char *ptr;
11498 long len;
11499 char *dest;
11500
11501 if (!rb_str_symname_p(str)) {
11502 str = rb_str_inspect(str);
11503 len = RSTRING_LEN(str);
11504 rb_str_resize(str, len + 1);
11505 dest = RSTRING_PTR(str);
11506 memmove(dest + 1, dest, len);
11507 }
11508 else {
11509 rb_encoding *enc = STR_ENC_GET(str);
11510 RSTRING_GETMEM(str, ptr, len);
11511 str = rb_enc_str_new(0, len + 1, enc);
11512 dest = RSTRING_PTR(str);
11513 memcpy(dest + 1, ptr, len);
11514 }
11515 dest[0] = ':';
11516 return str;
11517}
11518
11519/*
11520 * call-seq:
11521 * to_s -> string
11522 *
11523 * Returns a string representation of +self+ (not including the leading colon):
11524 *
11525 * :foo.to_s # => "foo"
11526 *
11527 * Symbol#id2name is an alias for Symbol#to_s.
11528 *
11529 * Related: Symbol#inspect, Symbol#name.
11530 */
11531
11532VALUE
11534{
11535 return str_new_shared(rb_cString, rb_sym2str(sym));
11536}
11537
11538MJIT_FUNC_EXPORTED VALUE
11539rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11540{
11541 VALUE obj;
11542
11543 if (argc < 1) {
11544 rb_raise(rb_eArgError, "no receiver given");
11545 }
11546 obj = argv[0];
11547 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11548}
11549
11550/*
11551 * call-seq:
11552 * succ
11553 *
11554 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11555 *
11556 * :foo.succ # => :fop
11557 *
11558 * Symbol#next is an alias for Symbol#succ.
11559 *
11560 * Related: String#succ.
11561 */
11562
11563static VALUE
11564sym_succ(VALUE sym)
11565{
11566 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11567}
11568
11569/*
11570 * call-seq:
11571 * symbol <=> object -> -1, 0, +1, or nil
11572 *
11573 * If +object+ is a symbol,
11574 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11575 *
11576 * :bar <=> :foo # => -1
11577 * :foo <=> :foo # => 0
11578 * :foo <=> :bar # => 1
11579 *
11580 * Otherwise, returns +nil+:
11581 *
11582 * :foo <=> 'bar' # => nil
11583 *
11584 * Related: String#<=>.
11585 */
11586
11587static VALUE
11588sym_cmp(VALUE sym, VALUE other)
11589{
11590 if (!SYMBOL_P(other)) {
11591 return Qnil;
11592 }
11593 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11594}
11595
11596/*
11597 * call-seq:
11598 * casecmp(object) -> -1, 0, 1, or nil
11599 *
11600 * :include: doc/symbol/casecmp.rdoc
11601 *
11602 */
11603
11604static VALUE
11605sym_casecmp(VALUE sym, VALUE other)
11606{
11607 if (!SYMBOL_P(other)) {
11608 return Qnil;
11609 }
11610 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11611}
11612
11613/*
11614 * call-seq:
11615 * casecmp?(object) -> true, false, or nil
11616 *
11617 * :include: doc/symbol/casecmp_p.rdoc
11618 *
11619 */
11620
11621static VALUE
11622sym_casecmp_p(VALUE sym, VALUE other)
11623{
11624 if (!SYMBOL_P(other)) {
11625 return Qnil;
11626 }
11627 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11628}
11629
11630/*
11631 * call-seq:
11632 * symbol =~ object -> integer or nil
11633 *
11634 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11635 * including possible updates to global variables;
11636 * see String#=~.
11637 *
11638 */
11639
11640static VALUE
11641sym_match(VALUE sym, VALUE other)
11642{
11643 return rb_str_match(rb_sym2str(sym), other);
11644}
11645
11646/*
11647 * call-seq:
11648 * match(pattern, offset = 0) -> matchdata or nil
11649 * match(pattern, offset = 0) {|matchdata| } -> object
11650 *
11651 * Equivalent to <tt>self.to_s.match</tt>,
11652 * including possible updates to global variables;
11653 * see String#match.
11654 *
11655 */
11656
11657static VALUE
11658sym_match_m(int argc, VALUE *argv, VALUE sym)
11659{
11660 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11661}
11662
11663/*
11664 * call-seq:
11665 * match?(pattern, offset = 0) -> true or false
11666 *
11667 * Equivalent to <tt>sym.to_s.match?</tt>;
11668 * see String#match?.
11669 *
11670 */
11671
11672static VALUE
11673sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11674{
11675 return rb_str_match_m_p(argc, argv, sym);
11676}
11677
11678/*
11679 * call-seq:
11680 * symbol[index] -> string or nil
11681 * symbol[start, length] -> string or nil
11682 * symbol[range] -> string or nil
11683 * symbol[regexp, capture = 0] -> string or nil
11684 * symbol[substring] -> string or nil
11685 *
11686 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11687 *
11688 */
11689
11690static VALUE
11691sym_aref(int argc, VALUE *argv, VALUE sym)
11692{
11693 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11694}
11695
11696/*
11697 * call-seq:
11698 * length -> integer
11699 *
11700 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11701 *
11702 * Symbol#size is an alias for Symbol#length.
11703 *
11704 */
11705
11706static VALUE
11707sym_length(VALUE sym)
11708{
11709 return rb_str_length(rb_sym2str(sym));
11710}
11711
11712/*
11713 * call-seq:
11714 * empty? -> true or false
11715 *
11716 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11717 *
11718 */
11719
11720static VALUE
11721sym_empty(VALUE sym)
11722{
11723 return rb_str_empty(rb_sym2str(sym));
11724}
11725
11726/*
11727 * call-seq:
11728 * upcase(*options) -> symbol
11729 *
11730 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11731 *
11732 * See String#upcase.
11733 *
11734 */
11735
11736static VALUE
11737sym_upcase(int argc, VALUE *argv, VALUE sym)
11738{
11739 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11740}
11741
11742/*
11743 * call-seq:
11744 * downcase(*options) -> symbol
11745 *
11746 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11747 *
11748 * See String#downcase.
11749 *
11750 * Related: Symbol#upcase.
11751 *
11752 */
11753
11754static VALUE
11755sym_downcase(int argc, VALUE *argv, VALUE sym)
11756{
11757 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11758}
11759
11760/*
11761 * call-seq:
11762 * capitalize(*options) -> symbol
11763 *
11764 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11765 *
11766 * See String#capitalize.
11767 *
11768 */
11769
11770static VALUE
11771sym_capitalize(int argc, VALUE *argv, VALUE sym)
11772{
11773 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11774}
11775
11776/*
11777 * call-seq:
11778 * swapcase(*options) -> symbol
11779 *
11780 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11781 *
11782 * See String#swapcase.
11783 *
11784 */
11785
11786static VALUE
11787sym_swapcase(int argc, VALUE *argv, VALUE sym)
11788{
11789 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11790}
11791
11792/*
11793 * call-seq:
11794 * start_with?(*string_or_regexp) -> true or false
11795 *
11796 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11797 *
11798 */
11799
11800static VALUE
11801sym_start_with(int argc, VALUE *argv, VALUE sym)
11802{
11803 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11804}
11805
11806/*
11807 * call-seq:
11808 * end_with?(*string_or_regexp) -> true or false
11809 *
11810 *
11811 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11812 *
11813 */
11814
11815static VALUE
11816sym_end_with(int argc, VALUE *argv, VALUE sym)
11817{
11818 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11819}
11820
11821/*
11822 * call-seq:
11823 * encoding -> encoding
11824 *
11825 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
11826 *
11827 */
11828
11829static VALUE
11830sym_encoding(VALUE sym)
11831{
11832 return rb_obj_encoding(rb_sym2str(sym));
11833}
11834
11835static VALUE
11836string_for_symbol(VALUE name)
11837{
11838 if (!RB_TYPE_P(name, T_STRING)) {
11839 VALUE tmp = rb_check_string_type(name);
11840 if (NIL_P(tmp)) {
11841 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11842 name);
11843 }
11844 name = tmp;
11845 }
11846 return name;
11847}
11848
11849ID
11851{
11852 if (SYMBOL_P(name)) {
11853 return SYM2ID(name);
11854 }
11855 name = string_for_symbol(name);
11856 return rb_intern_str(name);
11857}
11858
11859VALUE
11861{
11862 if (SYMBOL_P(name)) {
11863 return name;
11864 }
11865 name = string_for_symbol(name);
11866 return rb_str_intern(name);
11867}
11868
11869/*
11870 * call-seq:
11871 * Symbol.all_symbols -> array_of_symbols
11872 *
11873 * Returns an array of all symbols currently in Ruby's symbol table:
11874 *
11875 * Symbol.all_symbols.size # => 9334
11876 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
11877 *
11878 */
11879
11880static VALUE
11881sym_all_symbols(VALUE _)
11882{
11883 return rb_sym_all_symbols();
11884}
11885
11886VALUE
11888{
11889 return rb_fstring(str);
11890}
11891
11892VALUE
11893rb_interned_str(const char *ptr, long len)
11894{
11895 struct RString fake_str;
11896 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11897}
11898
11899VALUE
11901{
11902 return rb_interned_str(ptr, strlen(ptr));
11903}
11904
11905VALUE
11906rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11907{
11908 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11909 rb_enc_autoload(enc);
11910 }
11911
11912 struct RString fake_str;
11913 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11914}
11915
11916VALUE
11918{
11919 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11920}
11921
/*
 * Registers the String and Symbol methods implemented in this file.
 * Each rb_define_method() call binds a Ruby-visible method name to a C
 * function with the given arity; the order of the calls is kept as is.
 */
11922void
11923Init_String(void)
11924{
/* NOTE(review): the embedded numbering skips 11925 and 11928 in this extract;
 * statements appear to be missing here (presumably where rb_cString is set up)
 * -- confirm against upstream. */
/* Apply fstring_set_class_i to every entry of the fstring table, passing rb_cString. */
11926 assert(rb_vm_fstring_table());
11927 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
11929 rb_define_alloc_func(rb_cString, empty_str_alloc);
11930 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
11931 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
11932 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
11933 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
11934 rb_define_method(rb_cString, "==", rb_str_equal, 1);
11935 rb_define_method(rb_cString, "===", rb_str_equal, 1);
11936 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
11937 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
11938 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
11939 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
11940 rb_define_method(rb_cString, "+", rb_str_plus, 1);
11941 rb_define_method(rb_cString, "*", rb_str_times, 1);
11942 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
11943 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
11944 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
11945 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
11946 rb_define_method(rb_cString, "length", rb_str_length, 0);
11947 rb_define_method(rb_cString, "size", rb_str_length, 0);
11948 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
11949 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
11950 rb_define_method(rb_cString, "=~", rb_str_match, 1);
11951 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
11952 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
11953 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
11954 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
11955 rb_define_method(rb_cString, "next", rb_str_succ, 0);
11956 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
11957 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
11958 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
11959 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
11960 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
11961 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
11962 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
11963 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
11964 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
11965 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
11966 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
11967 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
11968 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
11969 rb_define_method(rb_cString, "scrub", str_scrub, -1);
11970 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
11971 rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
11972 rb_define_method(rb_cString, "+@", str_uplus, 0);
11973 rb_define_method(rb_cString, "-@", str_uminus, 0);
11974 rb_define_alias(rb_cString, "dedup", "-@");
11975
11976 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
11977 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
11978 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
11979 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
11980 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
11981 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
11982 rb_define_method(rb_cString, "undump", str_undump, 0);
11983
/* Cache the symbols :ascii, :turkic, :lithuanian and :fold
 * (presumably the option symbols accepted by the case-mapping methods below -- confirm). */
11984 sym_ascii = ID2SYM(rb_intern_const("ascii"));
11985 sym_turkic = ID2SYM(rb_intern_const("turkic"));
11986 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
11987 sym_fold = ID2SYM(rb_intern_const("fold"));
11988
11989 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
11990 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
11991 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
11992 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
11993
11994 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
11995 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
11996 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
11997 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
11998
11999 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12000 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12001 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12002 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12003 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12004 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12005 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12006 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12007 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12008 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12009 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12010 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
12011 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12012 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12013 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12014 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12015 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12016
12017 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12018 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12019 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12020
12021 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12022
12023 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12024 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12025 rb_define_method(rb_cString, "center", rb_str_center, -1);
12026
12027 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12028 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12029 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12030 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12031 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12032 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12033 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12034 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12035 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12036
12037 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12038 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12039 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12040 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12041 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12042 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12043 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12044 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12045 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12046
12047 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12048 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12049 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12050 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12051 rb_define_method(rb_cString, "count", rb_str_count, -1);
12052
12053 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12054 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12055 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12056 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12057
12058 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12059 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12060 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12061 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12062 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12063
12064 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12065
12066 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12067 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12068
12069 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12070 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12071
12072 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12073 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12074 rb_define_method(rb_cString, "b", rb_str_b, 0);
12075 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12076 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12077
12078 /* define UnicodeNormalize module here so that we don't have to look it up */
12079 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12080 id_normalize = rb_intern_const("normalize");
12081 id_normalized_p = rb_intern_const("normalized?");
12082
12083 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12084 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12085 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12086
/* Expose rb_fs as the global variables $; and $-F (both with rb_fs_setter as
 * setter) and register its address with the GC. */
12087 rb_fs = Qnil;
12088 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12089 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12090 rb_gc_register_address(&rb_fs);
12091
/* NOTE(review): the embedded numbering skips 12092-12095 in this extract;
 * statements appear to be missing here (presumably where rb_cSymbol is set up)
 * -- confirm against upstream. */
/* Symbol methods. */
12096 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12097
12098 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12099 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12100 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12101 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
12102 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12103 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12104 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12105 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12106 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12107
12108 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12109 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12110 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12111 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12112
12113 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12114 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12115 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12116 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12117 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12118 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12119 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12120
12121 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12122 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12123 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12124 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12125
12126 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12127 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12128
12129 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12130}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:356
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1090
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:888
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:998
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2249
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2073
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:864
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2328
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:142
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:67
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:145
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:144
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:143
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:533
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:140
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:137
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:534
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:535
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:97
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:532
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:139
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:68
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:141
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:146
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it always reports, regardless of the runtime -W flag.
Definition error.c:421
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition error.c:3148
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:684
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3260
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition error.c:794
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1095
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1091
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition error.c:3199
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1098
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1089
VALUE rb_eArgError
ArgumentError exception.
Definition error.c:1092
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1093
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_cObject
Documented in include/ruby/internal/globals.h.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:589
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:1939
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1194
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3412
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:190
VALUE rb_cSymbol
Symbol class.
Definition string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:122
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1182
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3022
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition rgengc.h:220
Encoding related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1208
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:821
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1074
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2716
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1093
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:11906
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:249
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2060
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3261
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1313
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1214
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:833
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:11917
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:719
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:407
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1453
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2630
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2884
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1709
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the given array.
Definition vm_eval.c:1190
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen.
Definition error.h:264
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:604
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:200
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1662
VALUE rb_sym_all_symbols(void)
Collects every symbol that has ever been interned in the entire history of the current process.
Definition symbol.c:1009
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1668
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1578
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1229
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4107
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3590
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1435
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1861
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby string instead of a C string.
Definition string.c:11887
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1571
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1376
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2211
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3324
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1289
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11533
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2283
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1265
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1565
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2744
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4794
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3548
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character offsets.
Definition string.c:2826
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:10826
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1741
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1618
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1056
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:871
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1382
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1834
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition string.c:2437
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3538
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3150
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2149
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1840
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:5996
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:2834
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:11900
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1295
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3292
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2791
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3650
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3020
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6678
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2489
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:11893
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3604
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3424
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:3579
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3268
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:2942
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5298
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:10884
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1513
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method, if any.
Definition string.c:2640
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2921
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3003
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3037
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1068
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2445
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6792
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1277
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1532
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2163
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5224
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8862
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1062
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:844
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1682
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2823
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1159
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:942
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:11860
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition string.c:11850
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition symbol.c:795
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1765
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3369
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4351
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1358
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:69
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition rgengc.h:107
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:139
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:72
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1307
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2617
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:574
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2501
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1301
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2512
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1609
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:95
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:441
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes a C string instead of a Ruby string.
Definition load.c:1307
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:231
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:234
long capa
Capacity of *ptr.
Definition rstring.h:268
struct RString::@50::@52 embed
Embedded contents.
char ary[RSTRING_EMBED_LEN_MAX+1]
When a string is short enough, it uses this area to store the contents themselves.
Definition rstring.h:298
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:250
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:276
char * ptr
Pointer to the contents of the string.
Definition rstring.h:258
Definition st.h:79
Definition string.c:7747
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:299
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52