monotone

monotone Mtn Source Tree

Root/simplestring_xform.cc

1#include "base.hh"
2#include "simplestring_xform.hh"
3#include "sanity.hh"
4#include "constants.hh"
5
6#include <set>
7#include <algorithm>
8#include <sstream>
9#include <iterator>
10
11using std::set;
12using std::string;
13using std::vector;
14using std::ostringstream;
15using std::ostream_iterator;
16using std::transform;
17
// Function object that maps a single character to its lower-case form via
// the C library's ::tolower.
struct
lowerize
{
  char operator()(char const & c) const
  {
    // ::tolower is only defined for values representable as unsigned char
    // (or EOF).  Converting a plain char straight to int yields a negative
    // value for bytes >= 0x80 when char is signed, so widen through
    // unsigned char first.
    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
  }
};
26
27string
28lowercase(string const & in)
29{
30 string n(in);
31 transform(n.begin(), n.end(), n.begin(), lowerize());
32 return n;
33}
34
// Function object that maps a single character to its upper-case form via
// the C library's ::toupper.
struct
upperize
{
  char operator()(char const & c) const
  {
    // ::toupper is only defined for values representable as unsigned char
    // (or EOF).  Converting a plain char straight to int yields a negative
    // value for bytes >= 0x80 when char is signed, so widen through
    // unsigned char first.
    return static_cast<char>(::toupper(static_cast<unsigned char>(c)));
  }
};
43
44string
45uppercase(string const & in)
46{
47 string n(in);
48 transform(n.begin(), n.end(), n.begin(), upperize());
49 return n;
50}
51
52void split_into_lines(string const & in,
53 string const & encoding,
54 vector<string> & out)
55{
56 string lc_encoding = lowercase(encoding);
57 out.clear();
58
59 // note: this function does not handle ISO-2022-X, Shift-JIS, and
60 // probably a good deal of other encodings as well. please expand
61 // the logic here if you can work out an easy way of doing line
62 // breaking on these encodings. currently it's just designed to
63 // work with charsets in which 0x0a / 0x0d are *always* \n and \r
64 // respectively.
65 //
66 // as far as I know, this covers the EUC, ISO-8859-X, GB, Big5, KOI,
67 // ASCII, and UTF-8 families of encodings.
68
69 if (lc_encoding == constants::default_encoding
70 || lc_encoding.find("ascii") != string::npos
71 || lc_encoding.find("8859") != string::npos
72 || lc_encoding.find("euc") != string::npos
73 || lc_encoding.find("koi") != string::npos
74 || lc_encoding.find("gb") != string::npos
75 || lc_encoding == "utf-8"
76 || lc_encoding == "utf_8"
77 || lc_encoding == "utf8")
78 {
79 string::size_type begin = 0;
80 string::size_type end = in.find_first_of("\r\n", begin);
81
82 while (end != string::npos && end >= begin)
83 {
84 out.push_back(in.substr(begin, end-begin));
85 if (in.at(end) == '\r'
86 && in.size() > end+1
87 && in.at(end+1) == '\n')
88 begin = end + 2;
89 else
90 begin = end + 1;
91 if (begin >= in.size())
92 break;
93 end = in.find_first_of("\r\n", begin);
94 }
95 if (begin < in.size())
96 out.push_back(in.substr(begin, in.size() - begin));
97 }
98 else
99 {
100 out.push_back(in);
101 }
102}
103
104
105void
106split_into_lines(string const & in,
107 vector<string> & out)
108{
109 split_into_lines(in, constants::default_encoding, out);
110}
111
// Concatenate the elements of IN into OUT, writing LINESEP after every
// element -- the last one included, so a non-empty result always ends with
// the separator.  The previous contents of OUT are replaced.
void
join_lines(vector<string> const & in,
           string & out,
           string const & linesep)
{
  // The separator is emitted as a C string, i.e. up to its first NUL.
  char const * const sep = linesep.c_str();

  ostringstream buf;
  for (vector<string>::const_iterator line = in.begin();
       line != in.end(); ++line)
    buf << *line << sep;

  out = buf.str();
}
121
122void
123join_lines(vector<string> const & in,
124 string & out)
125{
126 join_lines(in, out, "\n");
127}
128
129void
130prefix_lines_with(string const & prefix, string const & lines, string & out)
131{
132 vector<string> msgs;
133 split_into_lines(lines, msgs);
134
135 ostringstream oss;
136 for (vector<string>::const_iterator i = msgs.begin();
137 i != msgs.end();)
138 {
139 oss << prefix << *i;
140 i++;
141 if (i != msgs.end())
142 oss << '\n';
143 }
144
145 out = oss.str();
146}
147
// Append S to APPENDTO, leaving out every newline, carriage return, tab
// and space character found in S.  The existing contents of APPENDTO are
// kept as they are.
// NOTE(review): APPENDTO is resized before S is read, so S presumably must
// not refer to the same string object as APPENDTO -- confirm with callers.
void
append_without_ws(string & appendto, string const & s)
{
  // Use the string's own size type; a plain `unsigned` can truncate the
  // offset on platforms where size_type is wider than unsigned int.
  string::size_type pos = appendto.size();

  // Grow once to the largest length that could be needed, fill in the
  // kept characters, then shrink back to what was actually written.
  appendto.resize(pos + s.size());
  for (string::const_iterator i = s.begin();
       i != s.end(); ++i)
    {
      switch (*i)
        {
        case '\n':
        case '\r':
        case '\t':
        case ' ':
          break;
        default:
          appendto[pos] = *i;
          ++pos;
          break;
        }
    }
  appendto.resize(pos);
}
171
172string
173remove_ws(string const & s)
174{
175 string tmp;
176 append_without_ws(tmp, s);
177 return tmp;
178}
179
// Return S with leading and trailing newline, carriage return, tab and
// space characters removed.  Whitespace inside the string is kept.  A
// string that is empty or consists only of those characters yields the
// empty string.
string
trim_ws(string const & s)
{
  char const * const whitespace = "\n\r\t ";

  string::size_type const first = s.find_first_not_of(whitespace);
  if (first == string::npos)
    // Nothing but whitespace (or nothing at all): without this check the
    // input would be handed back untrimmed.
    return string();

  // A non-whitespace character exists, so this search cannot fail.
  string::size_type const last = s.find_last_not_of(whitespace);
  return s.substr(first, last - first + 1);
}
192
193#ifdef BUILD_UNIT_TESTS
194#include "unit_tests.hh"
195#include "vocab.hh"
196
// Unit test for uppercase() and lowercase(): all-lower, all-upper and
// mixed-case ASCII inputs are converted, and a string of punctuation
// characters comes back unchanged from both functions.
197UNIT_TEST(simplestring_xform, caseconv)
198{
199 UNIT_TEST_CHECK(uppercase("hello") == "HELLO");
200 UNIT_TEST_CHECK(uppercase("heLlO") == "HELLO");
201 UNIT_TEST_CHECK(lowercase("POODLE DAY") == "poodle day");
202 UNIT_TEST_CHECK(lowercase("PooDLe DaY") == "poodle day");
// characters without a case must pass through untouched
203 UNIT_TEST_CHECK(uppercase("!@#$%^&*()") == "!@#$%^&*()");
204 UNIT_TEST_CHECK(lowercase("!@#$%^&*()") == "!@#$%^&*()");
205}
206
// Unit test for the two-argument join_lines(): an empty vector gives an
// empty string, and every element -- including the last -- is followed by
// "\n" in the joined result.
207UNIT_TEST(simplestring_xform, join_lines)
208{
209 vector<string> strs;
210 string joined;
211
// empty input
212 strs.clear();
213 join_lines(strs, joined);
214 UNIT_TEST_CHECK(joined == "");
215
// the same vector grows by one element per step; joined is overwritten
// on every call
216 strs.push_back("hi");
217 join_lines(strs, joined);
218 UNIT_TEST_CHECK(joined == "hi\n");
219
220 strs.push_back("there");
221 join_lines(strs, joined);
222 UNIT_TEST_CHECK(joined == "hi\nthere\n");
223
224 strs.push_back("user");
225 join_lines(strs, joined);
226 UNIT_TEST_CHECK(joined == "hi\nthere\nuser\n");
227}
228
// Unit test for join_words(), which is not defined in the visible part of
// this file.  It is called on both a vector and a set of utf8 values, with
// its default separator and with ", ".  The checks expect a single space
// as the default separator and no separator after the last element.
229UNIT_TEST(simplestring_xform, join_words)
230{
231 vector< utf8 > v;
232 set< utf8 > s;
233
// no elements
234 v.clear();
235 UNIT_TEST_CHECK(join_words(v)() == "");
236
// one element: the separator never appears
237 v.clear();
238 v.push_back(utf8("a"));
239 UNIT_TEST_CHECK(join_words(v)() == "a");
240 UNIT_TEST_CHECK(join_words(v, ", ")() == "a");
241
242 s.clear();
243 s.insert(utf8("a"));
244 UNIT_TEST_CHECK(join_words(s)() == "a");
245 UNIT_TEST_CHECK(join_words(s, ", ")() == "a");
246
// two elements
247 v.clear();
248 v.push_back(utf8("a"));
249 v.push_back(utf8("b"));
250 UNIT_TEST_CHECK(join_words(v)() == "a b");
251 UNIT_TEST_CHECK(join_words(v, ", ")() == "a, b");
252
// the set is filled out of order; the expected output follows the set's
// iteration order ("a" before "b")
253 s.clear();
254 s.insert(utf8("b"));
255 s.insert(utf8("a"));
256 UNIT_TEST_CHECK(join_words(s)() == "a b");
257 UNIT_TEST_CHECK(join_words(s, ", ")() == "a, b");
258
// three elements
259 v.clear();
260 v.push_back(utf8("a"));
261 v.push_back(utf8("b"));
262 v.push_back(utf8("c"));
263 UNIT_TEST_CHECK(join_words(v)() == "a b c");
264 UNIT_TEST_CHECK(join_words(v, ", ")() == "a, b, c");
265
266 s.clear();
267 s.insert(utf8("b"));
268 s.insert(utf8("a"));
269 s.insert(utf8("c"));
270 UNIT_TEST_CHECK(join_words(s)() == "a b c");
271 UNIT_TEST_CHECK(join_words(s, ", ")() == "a, b, c");
272}
273
274UNIT_TEST(simplestring_xform, split_into_words)
275{
276 vector< utf8 > words;
277
278 words = split_into_words(utf8(""));
279 UNIT_TEST_CHECK(words.size() == 0);
280
281 words = split_into_words(utf8("foo"));
282 UNIT_TEST_CHECK(words.size() == 1);
283 UNIT_TEST_CHECK(words[0]() == "foo");
284
285 words = split_into_words(utf8("foo bar"));
286 UNIT_TEST_CHECK(words.size() == 2);
287 UNIT_TEST_CHECK(words[0]() == "foo");
288 UNIT_TEST_CHECK(words[1]() == "bar");
289
290 // describe() in commands.cc assumes this behavior. If it ever changes,
291 // remember to modify that function accordingly!
292 words = split_into_words(utf8("foo bar"));
293 UNIT_TEST_CHECK(words.size() == 3);
294 UNIT_TEST_CHECK(words[0]() == "foo");
295 UNIT_TEST_CHECK(words[1]() == "");
296 UNIT_TEST_CHECK(words[2]() == "bar");
297}
298
// Unit test for trim_ws() and remove_ws(): trim_ws() drops leading and
// trailing whitespace but keeps the space inside the text, while
// remove_ws() drops every space, tab and newline.
299UNIT_TEST(simplestring_xform, strip_ws)
300{
301 UNIT_TEST_CHECK(trim_ws("\n leading space") == "leading space");
302 UNIT_TEST_CHECK(trim_ws("trailing space \n") == "trailing space");
303 UNIT_TEST_CHECK(trim_ws("\t\n both \r \n\r\n") == "both");
// interior whitespace is removed as well
304 UNIT_TEST_CHECK(remove_ws(" I like going\tfor walks\n ")
305 == "Ilikegoingforwalks");
306}
307
308#endif // BUILD_UNIT_TESTS
309
310// Local Variables:
311// mode: C++
312// fill-column: 76
313// c-file-style: "gnu"
314// indent-tabs-mode: nil
315// End:
316// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:

Archive Download this file

Branches

Tags

Quick Links:     www.monotone.ca    -     Downloads    -     Documentation    -     Wiki    -     Code Forge    -     Build Status