Skip to content

Conversation

@shubhe25p
Copy link
Contributor

@shubhe25p shubhe25p commented Nov 29, 2025

Add %lc support to libc printf by utilizing wcrtomb internal function, also added relevant unit tests. The work has spun up new issues stated below. Resolves #166598

@llvmbot llvmbot added the libc label Nov 29, 2025
@llvmbot
Copy link
Member

llvmbot commented Nov 29, 2025

@llvm/pr-subscribers-libc

Author: Shubh Pachchigar (shubhe25p)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/169983.diff

4 Files Affected:

  • (modified) libc/src/stdio/printf_core/CMakeLists.txt (+3)
  • (modified) libc/src/stdio/printf_core/char_converter.h (+24-4)
  • (modified) libc/test/src/stdio/printf_core/CMakeLists.txt (+1)
  • (modified) libc/test/src/stdio/printf_core/converter_test.cpp (+54-1)
diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index 624129b2b36e7..f35a65de1f110 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -111,6 +111,7 @@ add_header_library(
     .printf_config
     .writer
     libc.include.inttypes
+    libc.hdr.types.wchar_t
     libc.src.__support.big_int
     libc.src.__support.common
     libc.src.__support.CPP.limits
@@ -123,6 +124,8 @@ add_header_library(
     libc.src.__support.integer_to_string
     libc.src.__support.libc_assert
     libc.src.__support.uint128
+    libc.src.__support.wchar.mbstate
+    libc.src.__support.wchar.wcrtomb
     libc.src.__support.StringUtil.error_to_string
     libc.src.string.memory_utils.inline_memcpy
 )
diff --git a/libc/src/stdio/printf_core/char_converter.h b/libc/src/stdio/printf_core/char_converter.h
index fd2eb2553887a..31dd34fe7a797 100644
--- a/libc/src/stdio/printf_core/char_converter.h
+++ b/libc/src/stdio/printf_core/char_converter.h
@@ -1,4 +1,4 @@
-//===-- String Converter for printf -----------------------------*- C++ -*-===//
+//===-- Character Converter for printf -----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,7 +9,10 @@
 #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CHAR_CONVERTER_H
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CHAR_CONVERTER_H
 
+#include "hdr/types/wchar_t.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/wcrtomb.h"
 #include "src/stdio/printf_core/converter_utils.h"
 #include "src/stdio/printf_core/core_structs.h"
 #include "src/stdio/printf_core/writer.h"
@@ -20,8 +23,11 @@ namespace printf_core {
 template <WriteMode write_mode>
 LIBC_INLINE int convert_char(Writer<write_mode> *writer,
                              const FormatSection &to_conv) {
-  char c = static_cast<char>(to_conv.conv_val_raw);
-
+  char c;
+  wchar_t wc;
+  char mb_str[MB_LEN_MAX];
+  internal::mbstate internal_mbstate = {0};
+  int ret = 0;
   constexpr int STRING_LEN = 1;
 
   size_t padding_spaces =
@@ -33,7 +39,21 @@ LIBC_INLINE int convert_char(Writer<write_mode> *writer,
     RET_IF_RESULT_NEGATIVE(writer->write(' ', padding_spaces));
   }
 
-  RET_IF_RESULT_NEGATIVE(writer->write(c));
+  if (to_conv.length_modifier == LengthModifier::l) {
+    wc = static_cast<wchar_t>(to_conv.conv_val_raw);
+    ret = internal::wcrtomb(mb_str, wc, &internal_mbstate);
+    if (ret <= 0) {
+      return -1;
+    }
+
+    for (int i = 0; i < ret; i++) {
+      RET_IF_RESULT_NEGATIVE(writer->write(mb_str[i]));
+    }
+
+  } else {
+    c = static_cast<char>(to_conv.conv_val_raw);
+    RET_IF_RESULT_NEGATIVE(writer->write(c));
+  }
 
   // If the padding is on the right side, write the spaces last.
   if (padding_spaces > 0 &&
diff --git a/libc/test/src/stdio/printf_core/CMakeLists.txt b/libc/test/src/stdio/printf_core/CMakeLists.txt
index ff7ebbc4f5fd0..a4c919420777d 100644
--- a/libc/test/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/test/src/stdio/printf_core/CMakeLists.txt
@@ -35,4 +35,5 @@ add_libc_unittest(
     libc.src.stdio.printf_core.converter
     libc.src.stdio.printf_core.writer
     libc.src.stdio.printf_core.core_structs
+    libc.hdr.types.wchar_t
 )
diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp
index 2dae2a22c864c..026e36747d4df 100644
--- a/libc/test/src/stdio/printf_core/converter_test.cpp
+++ b/libc/test/src/stdio/printf_core/converter_test.cpp
@@ -9,7 +9,7 @@
 #include "src/stdio/printf_core/converter.h"
 #include "src/stdio/printf_core/core_structs.h"
 #include "src/stdio/printf_core/writer.h"
-
+#include "hdr/types/wchar_t.h"
 #include "test/UnitTest/Test.h"
 
 class LlvmLibcPrintfConverterTest : public LIBC_NAMESPACE::testing::Test {
@@ -255,3 +255,56 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) {
   ASSERT_STREQ(str, "1234");
   ASSERT_EQ(writer.get_chars_written(), size_t{4});
 }
+
+TEST_F(LlvmLibcPrintfConverterTest, WideCharConversion) {
+
+  LIBC_NAMESPACE::printf_core::FormatSection section;
+  section.has_conv = true;
+  section.raw_string = "%c";
+  section.conv_name = 'c';
+  section.length_modifier = LIBC_NAMESPACE::printf_core::LengthModifier::l;
+  section.conv_val_raw = static_cast<wchar_t>(L'S');
+
+  LIBC_NAMESPACE::printf_core::convert(&writer, section);
+
+  wb.buff[wb.buff_cur] = '\0';
+
+  ASSERT_STREQ(str, "S");
+  ASSERT_EQ(writer.get_chars_written(), size_t{1});
+}
+
+TEST_F(LlvmLibcPrintfConverterTest, WideCharConversionLeftJustified) {
+  LIBC_NAMESPACE::printf_core::FormatSection left_justified_conv;
+  left_justified_conv.has_conv = true;
+  left_justified_conv.raw_string = "%-4c";
+  left_justified_conv.conv_name = 'c';
+  left_justified_conv.length_modifier =
+      LIBC_NAMESPACE::printf_core::LengthModifier::l;
+  left_justified_conv.flags =
+      LIBC_NAMESPACE::printf_core::FormatFlags::LEFT_JUSTIFIED;
+  left_justified_conv.min_width = 4;
+  left_justified_conv.conv_val_raw = static_cast<wchar_t>(L'S');
+
+  LIBC_NAMESPACE::printf_core::convert(&writer, left_justified_conv);
+  wb.buff[wb.buff_cur] = '\0';
+
+  ASSERT_STREQ(str, "S   ");
+  ASSERT_EQ(writer.get_chars_written(), size_t{4});
+}
+
+TEST_F(LlvmLibcPrintfConverterTest, WideCharConversionRightJustified) {
+  LIBC_NAMESPACE::printf_core::FormatSection right_justified_conv;
+  right_justified_conv.has_conv = true;
+  right_justified_conv.raw_string = "%4c";
+  right_justified_conv.conv_name = 'c';
+  right_justified_conv.length_modifier =
+      LIBC_NAMESPACE::printf_core::LengthModifier::l;
+  right_justified_conv.min_width = 4;
+  right_justified_conv.conv_val_raw = static_cast<wchar_t>(L'S');
+
+  LIBC_NAMESPACE::printf_core::convert(&writer, right_justified_conv);
+  wb.buff[wb.buff_cur] = '\0';
+
+  ASSERT_STREQ(str, "   S");
+  ASSERT_EQ(writer.get_chars_written(), size_t{4});
+}

@github-actions
Copy link

github-actions bot commented Nov 29, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@shubhe25p
Copy link
Contributor Author

shubhe25p commented Nov 29, 2025

This PR has spun up three new interesting directions, each of which seems feasible with some guidance. I am not sure which one can be pulled into this PR and if it is even an issue. Let me know your thoughts @michaelrj-google:

  1. wchar is a distinct integral type in C++ standard but BigInt guard clause will throw an error as wchar is not recognized. Here I have a workaround where I cast to unsigned int. So the work will involve adding wchar to BigInt.
  2. Adding wchar support on windows, currently a static assert in wcrtomb causes a build failure on windows platforms and work will involve adding UTF-16 support to wcrtomb
  3. %ls support in printf, which is a natural extension to this work and would involve a looping over a sequence of wchar

const FormatSection &to_conv) {
char c = static_cast<char>(to_conv.conv_val_raw);

char c;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please declare local variables in the scope where they're needed / combine this with initialization when possible.

That is, retain char c = static_cast<char>(...) in the else-block below, only declare local arrays / mbstate in the block for ell-specifier etc.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you so much for the feedback, I will update it.

RET_IF_RESULT_NEGATIVE(writer->write(c));
if (to_conv.length_modifier == LengthModifier::l) {
wc = static_cast<wchar_t>(static_cast<unsigned int>(to_conv.conv_val_raw));
ret = internal::wcrtomb(mb_str, wc, &internal_mbstate);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is this even working, given that internal::wcrtomb returns ErrorOr<size_t> ? Is it possible that your
int ret = internal::wcrtomb(...) works by round-tripping through operator bool conversion for ErrorOr, and thus will always return 0 or 1?

I don't see any tests with wint_t that would be represented as multiple bytes, or wint_t which can't represent a valid Unicode character (e.g. 0x12ffff), they should probably be added.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes exactly, I never realized this, thank you so much. My test was only using a wint_t which is only a single byte long so it worked. I will add more tests.

return -1;
}

for (int i = 0; i < ret; i++) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Writer has an API to write a buffer (cpp::string_view), I believe you can call it as writer->write({mb_str, mb_len})

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True it does, I will update the code.

}

TEST_F(LlvmLibcPrintfConverterTest, WideCharConversion) {

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove empty line


LIBC_NAMESPACE::printf_core::FormatSection section;
section.has_conv = true;
section.raw_string = "%c";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make your raw_string consistent with its broken down into - e.g. here it must be "%lc". Same for test cases below..

libc.src.stdio.printf_core.converter
libc.src.stdio.printf_core.writer
libc.src.stdio.printf_core.core_structs
libc.hdr.types.wchar_t
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please move this above libc.src. dependencies

section.length_modifier = LIBC_NAMESPACE::printf_core::LengthModifier::l;
section.conv_val_raw = static_cast<wchar_t>(L'S');

LIBC_NAMESPACE::printf_core::convert(&writer, section);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should also consider adding end-to-end test of this functionality - e.g. a test of snprintf invocation which prints converted wide characters to a multi-byte sting, and a test case in libc/test/src/stdio/printf_core/parser_test.cpp to ensure that %lc is parsed correctly.

For example, the Parser should accept the arguments of type wint_t in this case, according to specification - so probably you'd need to update the parsing code.

char c;
wchar_t wc;
char mb_str[MB_LEN_MAX];
internal::mbstate internal_mbstate = {0};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: it can be just "mbstate" variable, there's nothing particularly internal about it. Also, its default constructor would do the right thing already.


RET_IF_RESULT_NEGATIVE(writer->write(c));
if (to_conv.length_modifier == LengthModifier::l) {
wc = static_cast<wchar_t>(static_cast<unsigned int>(to_conv.conv_val_raw));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The argument type would be wint_t per standard, so you might need to handle WEOF and then static_cast.

You mentioned BigInt problems in the review thread, could you elaborate how are these relevant?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks this makes sense, what I was referring to was that naively casting from conv_val_raw (BigInt) to wchar_t leads to an error in BigInt to(), as the casting has a guard clause that essentially checks:

struct BigInt {
...
template <typename T>
  LIBC_INLINE constexpr cpp::enable_if_t<
      cpp::is_integral_v<T> && !cpp::is_same_v<T, bool>, T>
  to() const { 
       ... }

is_integral_v will return false if the type is not one of:

char, signed char, unsigned char, short, unsigned short, int,
      unsigned int, long, unsigned long, long long, unsigned long long, bool

That is the reason why I cast to unsigned int first. Let me know your thoughts

@vonosmas
Copy link
Contributor

vonosmas commented Dec 1, 2025

Thank you for starting to work on this!

This PR has spun up three new interesting directions, each of which seems feasible with some guidance. I am not sure which one can be pulled into this PR and if it is even an issue. Let me know your thoughts @michaelrj-google:

  1. wchar is a distinct integral type in C++ standard but BigInt guard clause will throw an error as wchar is not recognized. Here I have a workaround where I cast to unsigned int. So the work will involve adding wchar to BigInt.

See comment in CL - I think you need to roundtrip the input arguments through wint_t.

  1. Adding wchar support on windows, currently a static assert in wcrtomb causes a build failure on windows platforms and work will involve adding UTF-16 support to wcrtomb

Could you figure out why is our UTF-32-only wcrtomb gets pulled in by this change? I.e. is there a particular function that is enabled on Windows and depends on the printf functionality?

  1. %ls support in printf, which is a natural extension to this work and would involve a looping over a sequence of wchar

That sounds like a natural follow-up!

@shubhe25p shubhe25p marked this pull request as draft December 3, 2025 06:35
@shubhe25p shubhe25p force-pushed the br_printf_lc_support branch from c2a7797 to 7a33e23 Compare December 3, 2025 08:45
@shubhe25p shubhe25p closed this Dec 3, 2025
@shubhe25p shubhe25p reopened this Dec 3, 2025
@github-actions
Copy link

github-actions bot commented Dec 3, 2025

🐧 Linux x64 Test Results

✅ The build succeeded and no tests ran. This is expected in some build configurations.

@@ -33,7 +37,27 @@ LIBC_INLINE int convert_char(Writer<write_mode> *writer,
RET_IF_RESULT_NEGATIVE(writer->write(' ', padding_spaces));
}

RET_IF_RESULT_NEGATIVE(writer->write(c));
if (to_conv.length_modifier == LengthModifier::l) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this looks good overall, but we'll want to have a compile flag to disable wide characters to save code size. Could you add something similar to LIBC_COPT_PRINTF_DISABLE_STRERROR that just disables the wide character portion? You'll also need to guard the includes at the top of this file and the tests.

@@ -255,3 +256,83 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) {
ASSERT_STREQ(str, "1234");
ASSERT_EQ(writer.get_chars_written(), size_t{4});
}

TEST_F(LlvmLibcPrintfConverterTest, WideCharConversion) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For simplicity most of the functionality testing of the printf conversions is actually done in sprintf_test.cpp. Doing it that way makes the tests less verbose and more accurate to their actual use. I would recommend removing the tests from converter_test.cpp and only having the wide character tests in sprintf_test.cpp.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[libc] Support %lc in printf

4 participants