monotone

monotone Mtn Source Tree

Root/simplestring_xform.cc

1#include "base.hh"
2#include "simplestring_xform.hh"
3#include "sanity.hh"
4#include "constants.hh"
5
6#include <set>
7#include <sstream>
8
9using std::set;
10using std::string;
11using std::vector;
12using std::ostringstream;
13using std::ostream_iterator;
14using std::transform;
15
// Function object that maps one char to its lowercase form; used as the
// per-character operation by lowercase().
struct
lowerize
{
  // Returns the lowercase equivalent of C as reported by ::tolower, or C
  // itself when ::tolower leaves it unchanged.
  char operator()(char const & c) const
  {
    // The <ctype.h> functions require an argument that is representable
    // as unsigned char (or EOF).  Plain char may be signed, so a byte
    // >= 0x80 would be passed as a negative int, which is undefined
    // behaviour.  Convert through unsigned char first.
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
  }
};
24
25string
26lowercase(string const & in)
27{
28 string n(in);
29 transform(n.begin(), n.end(), n.begin(), lowerize());
30 return n;
31}
32
// Function object that maps one char to its uppercase form; used as the
// per-character operation by uppercase().
struct
upperize
{
  // Returns the uppercase equivalent of C as reported by ::toupper, or C
  // itself when ::toupper leaves it unchanged.
  char operator()(char const & c) const
  {
    // The <ctype.h> functions require an argument that is representable
    // as unsigned char (or EOF).  Plain char may be signed, so a byte
    // >= 0x80 would be passed as a negative int, which is undefined
    // behaviour.  Convert through unsigned char first.
    return static_cast<char>(::toupper(static_cast<unsigned char>(c)));
  }
};
41
42string
43uppercase(string const & in)
44{
45 string n(in);
46 transform(n.begin(), n.end(), n.begin(), upperize());
47 return n;
48}
49
50void split_into_lines(string const & in,
51 string const & encoding,
52 vector<string> & out)
53{
54 string lc_encoding = lowercase(encoding);
55 out.clear();
56
57 // note: this function does not handle ISO-2022-X, Shift-JIS, and
58 // probably a good deal of other encodings as well. please expand
59 // the logic here if you can work out an easy way of doing line
60 // breaking on these encodings. currently it's just designed to
61 // work with charsets in which 0x0a / 0x0d are *always* \n and \r
62 // respectively.
63 //
64 // as far as I know, this covers the EUC, ISO-8859-X, GB, Big5, KOI,
65 // ASCII, and UTF-8 families of encodings.
66
67 if (lc_encoding == constants::default_encoding
68 || lc_encoding.find("ascii") != string::npos
69 || lc_encoding.find("8859") != string::npos
70 || lc_encoding.find("euc") != string::npos
71 || lc_encoding.find("koi") != string::npos
72 || lc_encoding.find("gb") != string::npos
73 || lc_encoding == "utf-8"
74 || lc_encoding == "utf_8"
75 || lc_encoding == "utf8")
76 {
77 string::size_type begin = 0;
78 string::size_type end = in.find_first_of("\r\n", begin);
79
80 while (end != string::npos && end >= begin)
81 {
82 out.push_back(in.substr(begin, end-begin));
83 if (in.at(end) == '\r'
84 && in.size() > end+1
85 && in.at(end+1) == '\n')
86 begin = end + 2;
87 else
88 begin = end + 1;
89 if (begin >= in.size())
90 break;
91 end = in.find_first_of("\r\n", begin);
92 }
93 if (begin < in.size())
94 out.push_back(in.substr(begin, in.size() - begin));
95 }
96 else
97 {
98 out.push_back(in);
99 }
100}
101
102
103void
104split_into_lines(string const & in,
105 vector<string> & out)
106{
107 split_into_lines(in, constants::default_encoding, out);
108}
109
// Concatenate the elements of IN into OUT, writing LINESEP after every
// element, the last one included.  Any previous content of OUT is
// replaced; an empty IN yields an empty OUT.
void
join_lines(vector<string> const & in,
           string & out,
           string const & linesep)
{
  // The separator is written as a C string, so anything after an
  // embedded NUL in LINESEP is not emitted.
  char const * const separator = linesep.c_str();

  ostringstream joined;
  for (vector<string>::const_iterator line = in.begin();
       line != in.end(); ++line)
    joined << *line << separator;

  out = joined.str();
}
119
120void
121join_lines(vector<string> const & in,
122 string & out)
123{
124 join_lines(in, out, "\n");
125}
126
127void
128prefix_lines_with(string const & prefix, string const & lines, string & out)
129{
130 vector<string> msgs;
131 split_into_lines(lines, msgs);
132
133 ostringstream oss;
134 for (vector<string>::const_iterator i = msgs.begin();
135 i != msgs.end();)
136 {
137 oss << prefix << *i;
138 i++;
139 if (i != msgs.end())
140 oss << '\n';
141 }
142
143 out = oss.str();
144}
145
// Append S to APPENDTO, leaving out every '\n', '\r', '\t' and ' '
// character of S.  Existing content of APPENDTO is kept as is.
//
// NOTE(review): APPENDTO is resized before S is read, so passing the
// same object for both arguments is presumably unsupported.
void
append_without_ws(string & appendto, string const & s)
{
  // Use the string's own size type for the write position: a plain
  // `unsigned` would silently truncate the offset on platforms where
  // size_type is wider than unsigned.
  string::size_type pos = appendto.size();

  // Grow once to the largest size that could be needed, then shrink back
  // to what was actually written.
  appendto.resize(pos + s.size());
  for (string::const_iterator i = s.begin();
       i != s.end(); ++i)
    {
      switch (*i)
        {
        case '\n':
        case '\r':
        case '\t':
        case ' ':
          break;
        default:
          appendto[pos] = *i;
          ++pos;
          break;
        }
    }
  appendto.resize(pos);
}
169
170string
171remove_ws(string const & s)
172{
173 string tmp;
174 append_without_ws(tmp, s);
175 return tmp;
176}
177
// Return a copy of S with leading and trailing '\n', '\r', '\t' and ' '
// characters removed.  Whitespace inside the string is kept.
//
// A string that is empty or consists only of those characters yields an
// empty string.  (Previously an all-whitespace string was returned
// unchanged, because neither search found a non-whitespace character and
// so nothing was erased.)
string
trim_ws(string const & s)
{
  static char const whitespace[] = "\n\r\t ";

  string::size_type const first = s.find_first_not_of(whitespace);
  if (first == string::npos)
    return string();          // nothing but whitespace (or empty)

  // a non-whitespace character exists, so this search cannot fail
  string::size_type const last = s.find_last_not_of(whitespace);
  return s.substr(first, last - first + 1);
}
190
191#ifdef BUILD_UNIT_TESTS
192#include "unit_tests.hh"
193#include <stdlib.h>
194
// Unit test for uppercase() and lowercase(): all-lowercase, all-uppercase
// and mixed-case inputs are converted, and a punctuation-only string is
// expected to come back unchanged from both functions.
195UNIT_TEST(simplestring_xform, caseconv)
196{
197 UNIT_TEST_CHECK(uppercase("hello") == "HELLO");
198 UNIT_TEST_CHECK(uppercase("heLlO") == "HELLO");
199 UNIT_TEST_CHECK(lowercase("POODLE DAY") == "poodle day");
200 UNIT_TEST_CHECK(lowercase("PooDLe DaY") == "poodle day");
// characters without a case must pass through untouched
201 UNIT_TEST_CHECK(uppercase("!@#$%^&*()") == "!@#$%^&*()");
202 UNIT_TEST_CHECK(lowercase("!@#$%^&*()") == "!@#$%^&*()");
203}
204
// Unit test for the two-argument join_lines(): the same vector is grown
// one element at a time and re-joined after each step.  Every element,
// including the last, is expected to be followed by "\n".
205UNIT_TEST(simplestring_xform, join_lines)
206{
207 vector<string> strs;
208 string joined;
209
// no elements: the result is the empty string
210 strs.clear();
211 join_lines(strs, joined);
212 UNIT_TEST_CHECK(joined == "");
213
// one element: it still gets a trailing newline
214 strs.push_back("hi");
215 join_lines(strs, joined);
216 UNIT_TEST_CHECK(joined == "hi\n");
217
218 strs.push_back("there");
219 join_lines(strs, joined);
220 UNIT_TEST_CHECK(joined == "hi\nthere\n");
221
222 strs.push_back("user");
223 join_lines(strs, joined);
224 UNIT_TEST_CHECK(joined == "hi\nthere\nuser\n");
225}
226
// Unit test for join_words() (not defined in this part of the file) on
// both a vector and a set of utf8 values, with zero to three elements.
// The checks expect the elements to be separated by a single space when
// no separator is given, and by the given separator otherwise, with no
// separator after the last element.
227UNIT_TEST(simplestring_xform, join_words)
228{
229 vector< utf8 > v;
230 set< utf8 > s;
231
// empty container
232 v.clear();
233 UNIT_TEST_CHECK(join_words(v)() == "");
234
// single element: no separator appears at all
235 v.clear();
236 v.push_back(utf8("a"));
237 UNIT_TEST_CHECK(join_words(v)() == "a");
238 UNIT_TEST_CHECK(join_words(v, ", ")() == "a");
239
240 s.clear();
241 s.insert(utf8("a"));
242 UNIT_TEST_CHECK(join_words(s)() == "a");
243 UNIT_TEST_CHECK(join_words(s, ", ")() == "a");
244
// two elements
245 v.clear();
246 v.push_back(utf8("a"));
247 v.push_back(utf8("b"));
248 UNIT_TEST_CHECK(join_words(v)() == "a b");
249 UNIT_TEST_CHECK(join_words(v, ", ")() == "a, b");
250
// elements are inserted out of order; the expected output follows the
// set's iteration order ("a" before "b")
251 s.clear();
252 s.insert(utf8("b"));
253 s.insert(utf8("a"));
254 UNIT_TEST_CHECK(join_words(s)() == "a b");
255 UNIT_TEST_CHECK(join_words(s, ", ")() == "a, b");
256
// three elements
257 v.clear();
258 v.push_back(utf8("a"));
259 v.push_back(utf8("b"));
260 v.push_back(utf8("c"));
261 UNIT_TEST_CHECK(join_words(v)() == "a b c");
262 UNIT_TEST_CHECK(join_words(v, ", ")() == "a, b, c");
263
264 s.clear();
265 s.insert(utf8("b"));
266 s.insert(utf8("a"));
267 s.insert(utf8("c"));
268 UNIT_TEST_CHECK(join_words(s)() == "a b c");
269 UNIT_TEST_CHECK(join_words(s, ", ")() == "a, b, c");
270}
271
272UNIT_TEST(simplestring_xform, split_into_words)
273{
274 vector< utf8 > words;
275
276 words = split_into_words(utf8(""));
277 UNIT_TEST_CHECK(words.size() == 0);
278
279 words = split_into_words(utf8("foo"));
280 UNIT_TEST_CHECK(words.size() == 1);
281 UNIT_TEST_CHECK(words[0]() == "foo");
282
283 words = split_into_words(utf8("foo bar"));
284 UNIT_TEST_CHECK(words.size() == 2);
285 UNIT_TEST_CHECK(words[0]() == "foo");
286 UNIT_TEST_CHECK(words[1]() == "bar");
287
288 // describe() in commands.cc assumes this behavior. If it ever changes,
289 // remember to modify that function accordingly!
290 words = split_into_words(utf8("foo bar"));
291 UNIT_TEST_CHECK(words.size() == 3);
292 UNIT_TEST_CHECK(words[0]() == "foo");
293 UNIT_TEST_CHECK(words[1]() == "");
294 UNIT_TEST_CHECK(words[2]() == "bar");
295}
296
// Unit test for trim_ws() and remove_ws(): trim_ws is expected to drop
// whitespace at the start and end only, while remove_ws is expected to
// drop every space, tab and newline, including those between words.
297UNIT_TEST(simplestring_xform, strip_ws)
298{
299 UNIT_TEST_CHECK(trim_ws("\n leading space") == "leading space");
300 UNIT_TEST_CHECK(trim_ws("trailing space \n") == "trailing space");
301 UNIT_TEST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both");
// interior whitespace is removed as well
302 UNIT_TEST_CHECK(remove_ws(" I like going\tfor walks\n ")
303 == "Ilikegoingforwalks");
304}
305
306#endif // BUILD_UNIT_TESTS
307
308// Local Variables:
309// mode: C++
310// fill-column: 76
311// c-file-style: "gnu"
312// indent-tabs-mode: nil
313// End:
314// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status