monotone

monotone Mtn Source Tree

Root/simplestring_xform.cc

1#include "base.hh"
2#include "simplestring_xform.hh"
3#include "sanity.hh"
4#include "constants.hh"
5
6#include <set>
7#include <sstream>
8#include <iterator>
9
10using std::set;
11using std::string;
12using std::vector;
13using std::ostringstream;
14using std::ostream_iterator;
15using std::transform;
16
// Function object that maps one character to its lowercase form as
// reported by ::tolower (i.e. according to the current C locale).
struct
lowerize
{
  char operator()(char const & c) const
  {
    // ::tolower is only defined for arguments representable as unsigned
    // char (or EOF).  Where plain char is signed, a high-bit byte would
    // become a negative int, so convert through unsigned char first.
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
  }
};
25
26string
27lowercase(string const & in)
28{
29 string n(in);
30 transform(n.begin(), n.end(), n.begin(), lowerize());
31 return n;
32}
33
// Function object that maps one character to its uppercase form as
// reported by ::toupper (i.e. according to the current C locale).
struct
upperize
{
  char operator()(char const & c) const
  {
    // ::toupper is only defined for arguments representable as unsigned
    // char (or EOF).  Where plain char is signed, a high-bit byte would
    // become a negative int, so convert through unsigned char first.
    return static_cast<char>(::toupper(static_cast<unsigned char>(c)));
  }
};
42
43string
44uppercase(string const & in)
45{
46 string n(in);
47 transform(n.begin(), n.end(), n.begin(), upperize());
48 return n;
49}
50
51void split_into_lines(string const & in,
52 string const & encoding,
53 vector<string> & out)
54{
55 string lc_encoding = lowercase(encoding);
56 out.clear();
57
58 // note: this function does not handle ISO-2022-X, Shift-JIS, and
59 // probably a good deal of other encodings as well. please expand
60 // the logic here if you can work out an easy way of doing line
61 // breaking on these encodings. currently it's just designed to
62 // work with charsets in which 0x0a / 0x0d are *always* \n and \r
63 // respectively.
64 //
65 // as far as I know, this covers the EUC, ISO-8859-X, GB, Big5, KOI,
66 // ASCII, and UTF-8 families of encodings.
67
68 if (lc_encoding == constants::default_encoding
69 || lc_encoding.find("ascii") != string::npos
70 || lc_encoding.find("8859") != string::npos
71 || lc_encoding.find("euc") != string::npos
72 || lc_encoding.find("koi") != string::npos
73 || lc_encoding.find("gb") != string::npos
74 || lc_encoding == "utf-8"
75 || lc_encoding == "utf_8"
76 || lc_encoding == "utf8")
77 {
78 string::size_type begin = 0;
79 string::size_type end = in.find_first_of("\r\n", begin);
80
81 while (end != string::npos && end >= begin)
82 {
83 out.push_back(in.substr(begin, end-begin));
84 if (in.at(end) == '\r'
85 && in.size() > end+1
86 && in.at(end+1) == '\n')
87 begin = end + 2;
88 else
89 begin = end + 1;
90 if (begin >= in.size())
91 break;
92 end = in.find_first_of("\r\n", begin);
93 }
94 if (begin < in.size())
95 out.push_back(in.substr(begin, in.size() - begin));
96 }
97 else
98 {
99 out.push_back(in);
100 }
101}
102
103
104void
105split_into_lines(string const & in,
106 vector<string> & out)
107{
108 split_into_lines(in, constants::default_encoding, out);
109}
110
// Concatenate the strings of IN into OUT, writing LINESEP after every
// element, including the last one.  An empty IN yields an empty OUT.  Any
// previous contents of OUT are discarded.
void
join_lines(std::vector<std::string> const & in,
           std::string & out,
           std::string const & linesep)
{
  typedef std::vector<std::string>::const_iterator line_iter;

  // Work out the final length up front so the result is allocated once.
  std::string::size_type total = 0;
  for (line_iter i = in.begin(); i != in.end(); ++i)
    total += i->size() + linesep.size();

  // Build into a temporary and swap at the end, so OUT is only replaced
  // once the whole result exists.
  std::string joined;
  joined.reserve(total);
  for (line_iter i = in.begin(); i != in.end(); ++i)
    {
      joined += *i;
      // Append the separator as a std::string rather than via c_str(),
      // so a separator containing a NUL byte is not cut short.
      joined += linesep;
    }

  out.swap(joined);
}
120
121void
122join_lines(vector<string> const & in,
123 string & out)
124{
125 join_lines(in, out, "\n");
126}
127
128void
129prefix_lines_with(string const & prefix, string const & lines, string & out)
130{
131 vector<string> msgs;
132 split_into_lines(lines, msgs);
133
134 ostringstream oss;
135 for (vector<string>::const_iterator i = msgs.begin();
136 i != msgs.end();)
137 {
138 oss << prefix << *i;
139 i++;
140 if (i != msgs.end())
141 oss << '\n';
142 }
143
144 out = oss.str();
145}
146
// Append to APPENDTO every character of S except '\n', '\r', '\t' and
// ' '.  APPENDTO and S may be the same object.
void
append_without_ws(std::string & appendto, std::string const & s)
{
  // If both arguments are one object, resizing APPENDTO below would also
  // change S while it is being read.  Work from a snapshot instead.
  if (&appendto == &s)
    {
      std::string const snapshot(s);
      append_without_ws(appendto, snapshot);
      return;
    }

  // size_type rather than unsigned: the latter can be narrower than the
  // string's size type and would truncate the write position.
  std::string::size_type pos = appendto.size();

  // Grow once to the largest size that could be needed, fill in place,
  // then shrink back to what was actually written.
  appendto.resize(pos + s.size());
  for (std::string::const_iterator i = s.begin();
       i != s.end(); ++i)
    {
      switch (*i)
        {
        case '\n':
        case '\r':
        case '\t':
        case ' ':
          break;
        default:
          appendto[pos] = *i;
          ++pos;
          break;
        }
    }
  appendto.resize(pos);
}
170
171string
172remove_ws(string const & s)
173{
174 string tmp;
175 append_without_ws(tmp, s);
176 return tmp;
177}
178
// Return a copy of S with leading and trailing '\n', '\r', '\t' and ' '
// characters removed.  Whitespace inside the string is kept.  A string
// made up only of those characters (or an empty one) yields "".
std::string
trim_ws(std::string const & s)
{
  static char const whitespace[] = "\n\r\t ";

  std::string::size_type const first = s.find_first_not_of(whitespace);
  if (first == std::string::npos)
    // Nothing but whitespace: everything is trimmed away.
    return std::string();

  // At least one non-whitespace character exists, so this cannot be npos.
  std::string::size_type const last = s.find_last_not_of(whitespace);
  return s.substr(first, last - first + 1);
}
191
192#ifdef BUILD_UNIT_TESTS
193#include "unit_tests.hh"
194#include "vocab.hh"
195
196UNIT_TEST(simplestring_xform, caseconv)
197{
198 UNIT_TEST_CHECK(uppercase("hello") == "HELLO");
199 UNIT_TEST_CHECK(uppercase("heLlO") == "HELLO");
200 UNIT_TEST_CHECK(lowercase("POODLE DAY") == "poodle day");
201 UNIT_TEST_CHECK(lowercase("PooDLe DaY") == "poodle day");
202 UNIT_TEST_CHECK(uppercase("!@#$%^&*()") == "!@#$%^&*()");
203 UNIT_TEST_CHECK(lowercase("!@#$%^&*()") == "!@#$%^&*()");
204}
205
206UNIT_TEST(simplestring_xform, join_lines)
207{
208 vector<string> strs;
209 string joined;
210
211 strs.clear();
212 join_lines(strs, joined);
213 UNIT_TEST_CHECK(joined == "");
214
215 strs.push_back("hi");
216 join_lines(strs, joined);
217 UNIT_TEST_CHECK(joined == "hi\n");
218
219 strs.push_back("there");
220 join_lines(strs, joined);
221 UNIT_TEST_CHECK(joined == "hi\nthere\n");
222
223 strs.push_back("user");
224 join_lines(strs, joined);
225 UNIT_TEST_CHECK(joined == "hi\nthere\nuser\n");
226}
227
228UNIT_TEST(simplestring_xform, join_words)
229{
230 vector< utf8 > v;
231 set< utf8 > s;
232
233 v.clear();
234 UNIT_TEST_CHECK(join_words(v)() == "");
235
236 v.clear();
237 v.push_back(utf8("a"));
238 UNIT_TEST_CHECK(join_words(v)() == "a");
239 UNIT_TEST_CHECK(join_words(v, ", ")() == "a");
240
241 s.clear();
242 s.insert(utf8("a"));
243 UNIT_TEST_CHECK(join_words(s)() == "a");
244 UNIT_TEST_CHECK(join_words(s, ", ")() == "a");
245
246 v.clear();
247 v.push_back(utf8("a"));
248 v.push_back(utf8("b"));
249 UNIT_TEST_CHECK(join_words(v)() == "a b");
250 UNIT_TEST_CHECK(join_words(v, ", ")() == "a, b");
251
252 s.clear();
253 s.insert(utf8("b"));
254 s.insert(utf8("a"));
255 UNIT_TEST_CHECK(join_words(s)() == "a b");
256 UNIT_TEST_CHECK(join_words(s, ", ")() == "a, b");
257
258 v.clear();
259 v.push_back(utf8("a"));
260 v.push_back(utf8("b"));
261 v.push_back(utf8("c"));
262 UNIT_TEST_CHECK(join_words(v)() == "a b c");
263 UNIT_TEST_CHECK(join_words(v, ", ")() == "a, b, c");
264
265 s.clear();
266 s.insert(utf8("b"));
267 s.insert(utf8("a"));
268 s.insert(utf8("c"));
269 UNIT_TEST_CHECK(join_words(s)() == "a b c");
270 UNIT_TEST_CHECK(join_words(s, ", ")() == "a, b, c");
271}
272
273UNIT_TEST(simplestring_xform, split_into_words)
274{
275 vector< utf8 > words;
276
277 words = split_into_words(utf8(""));
278 UNIT_TEST_CHECK(words.size() == 0);
279
280 words = split_into_words(utf8("foo"));
281 UNIT_TEST_CHECK(words.size() == 1);
282 UNIT_TEST_CHECK(words[0]() == "foo");
283
284 words = split_into_words(utf8("foo bar"));
285 UNIT_TEST_CHECK(words.size() == 2);
286 UNIT_TEST_CHECK(words[0]() == "foo");
287 UNIT_TEST_CHECK(words[1]() == "bar");
288
289 // describe() in commands.cc assumes this behavior. If it ever changes,
290 // remember to modify that function accordingly!
291 words = split_into_words(utf8("foo bar"));
292 UNIT_TEST_CHECK(words.size() == 3);
293 UNIT_TEST_CHECK(words[0]() == "foo");
294 UNIT_TEST_CHECK(words[1]() == "");
295 UNIT_TEST_CHECK(words[2]() == "bar");
296}
297
298UNIT_TEST(simplestring_xform, strip_ws)
299{
300 UNIT_TEST_CHECK(trim_ws("\n leading space") == "leading space");
301 UNIT_TEST_CHECK(trim_ws("trailing space \n") == "trailing space");
302 UNIT_TEST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both");
303 UNIT_TEST_CHECK(remove_ws(" I like going\tfor walks\n ")
304 == "Ilikegoingforwalks");
305}
306
307#endif // BUILD_UNIT_TESTS
308
309// Local Variables:
310// mode: C++
311// fill-column: 76
312// c-file-style: "gnu"
313// indent-tabs-mode: nil
314// End:
315// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status