mirror of https://github.com/Mai-with-u/MaiBot.git
Squashed 'plugins/tts_voice_plugin/' content from commit d14ba1bd
git-subtree-dir: plugins/tts_voice_plugin git-subtree-split: d14ba1bdf00b09521a4eab8fd66ee83c64f2314cpull/1506/head
commit
0873d9e688
|
|
@ -0,0 +1,40 @@
|
|||
# 敏感配置文件
|
||||
config.toml
|
||||
config.toml.backup.*
|
||||
config.toml.reset.*
|
||||
|
||||
# Python缓存文件
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
|
||||
# 虚拟环境
|
||||
venv/
|
||||
ENV/
|
||||
env/
|
||||
|
||||
# IDE配置
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# 临时文件
|
||||
*.log
|
||||
*.tmp
|
||||
.DS_Store
|
||||
|
||||
# 生成的音频文件
|
||||
tts_*.mp3
|
||||
tts_*.wav
|
||||
tts_*.ogg
|
||||
|
||||
# 数据目录(包含临时音频文件)
|
||||
data/
|
||||
|
||||
# 规范工作流目录
|
||||
.spec-workflow/
|
||||
|
||||
# Claude配置
|
||||
.claude/
|
||||
|
|
@ -0,0 +1,661 @@
|
|||
GNU AFFERO GENERAL PUBLIC LICENSE
|
||||
Version 3, 19 November 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU Affero General Public License is a free, copyleft license for
|
||||
software and other kinds of works, specifically designed to ensure
|
||||
cooperation with the community in the case of network server software.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
our General Public Licenses are intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
Developers that use our General Public Licenses protect your rights
|
||||
with two steps: (1) assert copyright on the software, and (2) offer
|
||||
you this License which gives you legal permission to copy, distribute
|
||||
and/or modify the software.
|
||||
|
||||
A secondary benefit of defending all users' freedom is that
|
||||
improvements made in alternate versions of the program, if they
|
||||
receive widespread use, become available for other developers to
|
||||
incorporate. Many developers of free software are heartened and
|
||||
encouraged by the resulting cooperation. However, in the case of
|
||||
software used on network servers, this result may fail to come about.
|
||||
The GNU General Public License permits making a modified version and
|
||||
letting the public access it on a server without ever releasing its
|
||||
source code to the public.
|
||||
|
||||
The GNU Affero General Public License is designed specifically to
|
||||
ensure that, in such cases, the modified source code becomes available
|
||||
to the community. It requires the operator of a network server to
|
||||
provide the source code of the modified version running there to the
|
||||
users of that server. Therefore, public use of a modified version, on
|
||||
a publicly accessible server, gives the public access to the source
|
||||
code of the modified version.
|
||||
|
||||
An older license, called the Affero General Public License and
|
||||
published by Affero, was designed to accomplish similar goals. This is
|
||||
a different license, not a version of the Affero GPL, but Affero has
|
||||
released a new version of the Affero GPL which permits relicensing under
|
||||
this license.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU Affero General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Remote Network Interaction; Use with the GNU General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, if you modify the
|
||||
Program, your modified version must prominently offer all users
|
||||
interacting with it remotely through a computer network (if your version
|
||||
supports such interaction) an opportunity to receive the Corresponding
|
||||
Source of your version by providing access to the Corresponding Source
|
||||
from a network server at no charge, through some standard or customary
|
||||
means of facilitating copying of software. This Corresponding Source
|
||||
shall include the Corresponding Source for any work covered by version 3
|
||||
of the GNU General Public License that is incorporated pursuant to the
|
||||
following paragraph.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the work with which it is combined will remain governed by version
|
||||
3 of the GNU General Public License.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU Affero General Public License from time to time. Such new versions
|
||||
will be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU Affero General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU Affero General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU Affero General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published
|
||||
by the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If your software can interact with users remotely through a computer
|
||||
network, you should also make sure that it provides a way for users to
|
||||
get its source. For example, if your program is a web application, its
|
||||
interface could display a "Source" link that leads users to an archive
|
||||
of the code. There are many ways you could offer source, and different
|
||||
solutions will be better for different programs; see section 13 for the
|
||||
specific requirements.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU AGPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
||||
|
|
@ -0,0 +1,311 @@
|
|||
# TTS 语音合成插件
|
||||
|
||||
MaiBot 的文本转语音插件,支持多种 TTS 后端。
|
||||
|
||||
## 支持的后端
|
||||
|
||||
| 后端 | 说明 | 适用场景 |
|
||||
|------|------|----------|
|
||||
| AI Voice | MaiCore 内置,无需配置 | 仅群聊 |
|
||||
| GSV2P | 云端 API,需要 Token | 群聊/私聊 |
|
||||
| GPT-SoVITS | 本地服务,需自行部署 | 群聊/私聊 |
|
||||
| 豆包语音 | 火山引擎云服务,高质量 | 群聊/私聊 |
|
||||
| CosyVoice | 阿里云 CosyVoice3,支持方言和声音克隆 | 群聊/私聊 |
|
||||
| ComfyUI | 本地 ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone) | 群聊/私聊 |
|
||||
|
||||
## 安装
|
||||
|
||||
```bash
|
||||
pip install aiohttp gradio_client
|
||||
```
|
||||
|
||||
## 配置
|
||||
|
||||
编辑 `config.toml`,设置默认后端:
|
||||
|
||||
```toml
|
||||
[general]
|
||||
default_backend = "cosyvoice" # 可选:ai_voice / gsv2p / gpt_sovits / doubao / cosyvoice / comfyui
|
||||
audio_output_dir = "" # 音频输出目录,留空使用项目根目录
|
||||
use_base64_audio = false # 是否使用base64发送(备选方案)
|
||||
split_sentences = true # 是否分段发送语音(长文本逐句发送)
|
||||
split_delay = 0.3 # 分段发送间隔时间(秒)
|
||||
send_error_messages = true # 是否发送错误提示消息(false=静默失败)
|
||||
```
|
||||
|
||||
### Docker环境配置说明
|
||||
|
||||
**问题:** Docker环境中可能遇到音频上传失败或文件路径识别错误(如`识别URL失败`)
|
||||
|
||||
**解决方案(按推荐顺序):**
|
||||
|
||||
#### 方案1:使用相对路径(推荐)
|
||||
|
||||
```toml
|
||||
[general]
|
||||
audio_output_dir = "" # 留空,默认使用项目根目录
|
||||
```
|
||||
|
||||
音频文件将保存在项目根目录,OneBot/NapCat可以正确识别相对路径。
|
||||
|
||||
#### 方案2:自定义输出目录
|
||||
|
||||
```toml
|
||||
[general]
|
||||
audio_output_dir = "data/tts_audio" # 相对路径,相对于项目根目录
|
||||
# 或
|
||||
audio_output_dir = "/app/data/audio" # 绝对路径
|
||||
```
|
||||
|
||||
#### 方案3:使用base64编码(备选)
|
||||
|
||||
如果路径方案都不生效,可启用base64发送:
|
||||
|
||||
```toml
|
||||
[general]
|
||||
use_base64_audio = true # 使用base64编码发送(会增加约33%数据大小)
|
||||
```
|
||||
|
||||
### 豆包语音配置
|
||||
|
||||
```toml
|
||||
[doubao]
|
||||
app_id = "你的APP_ID"
|
||||
access_key = "你的ACCESS_KEY"
|
||||
resource_id = "seed-tts-2.0"
|
||||
default_voice = "zh_female_vv_uranus_bigtts"
|
||||
```
|
||||
|
||||
**预置音色:**
|
||||
|
||||
| 音色名称 | voice_type |
|
||||
|----------|------------|
|
||||
| vivi 2.0 | zh_female_vv_uranus_bigtts |
|
||||
| 大壹 | zh_male_dayi_saturn_bigtts |
|
||||
| 黑猫侦探社咪仔 | zh_female_mizai_saturn_bigtts |
|
||||
|
||||
**复刻音色:** 将 `resource_id` 改为 `seed-icl-2.0`,`default_voice` 填音色 ID(如 `S_xxxxxx`)
|
||||
|
||||
凭证获取:[火山引擎控制台](https://console.volcengine.com/speech/service/8)
|
||||
|
||||
### GSV2P 配置
|
||||
|
||||
```toml
|
||||
[gsv2p]
|
||||
api_token = "你的Token"
|
||||
default_voice = "原神-中文-派蒙_ZH"
|
||||
```
|
||||
|
||||
Token 获取:[https://tts.acgnai.top](https://tts.acgnai.top)
|
||||
|
||||
### AI Voice 配置
|
||||
|
||||
```toml
|
||||
[ai_voice]
|
||||
default_character = "温柔妹妹"
|
||||
```
|
||||
|
||||
可用音色:小新、猴哥、妲己、酥心御姐、温柔妹妹、邻家小妹 等 22 种
|
||||
|
||||
### GPT-SoVITS 配置
|
||||
|
||||
**支持两种配置格式:**
|
||||
|
||||
#### 格式1:数组格式(推荐,WebUI 友好)
|
||||
|
||||
```toml
|
||||
[gpt_sovits]
|
||||
server = "http://127.0.0.1:9880"
|
||||
|
||||
[[gpt_sovits.styles]]
|
||||
name = "default"
|
||||
refer_wav = "/path/to/reference.wav"
|
||||
prompt_text = "参考文本"
|
||||
prompt_language = "zh"
|
||||
gpt_weights = "/path/to/model.ckpt" # 可选:动态模型切换
|
||||
sovits_weights = "/path/to/model.pth" # 可选:动态模型切换
|
||||
|
||||
[[gpt_sovits.styles]]
|
||||
name = "happy"
|
||||
refer_wav = "/path/to/happy.wav"
|
||||
prompt_text = "开心的参考文本"
|
||||
prompt_language = "zh"
|
||||
```
|
||||
|
||||
#### 格式2:字典格式(兼容旧版)
|
||||
|
||||
```toml
|
||||
[gpt_sovits]
|
||||
server = "http://127.0.0.1:9880"
|
||||
|
||||
[gpt_sovits.styles.default]
|
||||
refer_wav = "/path/to/reference.wav"
|
||||
prompt_text = "参考文本"
|
||||
prompt_language = "zh"
|
||||
gpt_weights = "/path/to/model.ckpt"
|
||||
sovits_weights = "/path/to/model.pth"
|
||||
```
|
||||
|
||||
> **提示:** 插件会自动识别并兼容两种格式,推荐使用数组格式以获得更好的 WebUI 支持。
|
||||
|
||||
### CosyVoice 配置
|
||||
|
||||
```toml
|
||||
[cosyvoice]
|
||||
gradio_url = "https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/"
|
||||
default_mode = "3s极速复刻" # 或 "自然语言控制"
|
||||
default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>" # 只有自然语言控制模式才会生效,3s极速复刻模式下不生效
|
||||
reference_audio = "/path/to/ref.wav" # 参考音频路径
|
||||
prompt_text = "参考音频对应的文本" # 参考音频的对应文本
|
||||
timeout = 300 # API超时(秒)
|
||||
```
|
||||
|
||||
**支持的方言/情感/语速:**
|
||||
|
||||
| 类型 | 可用选项 |
|
||||
|------|----------|
|
||||
| 方言 | 广东话、东北话、四川话、上海话、闽南话、山东话、陕西话、湖南话等17种 |
|
||||
| 情感 | 开心、伤心、生气 |
|
||||
| 语速 | 慢速、快速 |
|
||||
| 音量 | 大声、小声 |
|
||||
| 特殊风格 | 小猪佩奇、机器人 |
|
||||
|
||||
**推理模式:**
|
||||
- `3s极速复刻`:需要提供参考音频进行声音克隆
|
||||
- `自然语言控制`:通过指令控制方言、情感、语速等
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 命令触发
|
||||
|
||||
```
|
||||
/tts 你好世界 # 使用默认后端
|
||||
/tts 今天天气不错 小新 # 指定音色
|
||||
/gsv2p 你好世界 # 使用 GSV2P
|
||||
/doubao 你好世界 # 使用豆包
|
||||
/cosyvoice 你好世界 四川话 # 使用 CosyVoice,四川话
|
||||
/comfyui 你好世界 -v default # 使用 ComfyUI 本地工作流(MLX VoiceClone)
|
||||
```
|
||||
|
||||
## ComfyUI 后端配置
|
||||
|
||||
该后端通过 ComfyUI 的 HTTP API 执行工作流(`/prompt` -> `/history` -> `/view`),并用 `LoadAudio` 从 ComfyUI 的 `input` 目录读取参考音频。
|
||||
|
||||
```toml
|
||||
[comfyui]
|
||||
server = "http://127.0.0.1:8188"
|
||||
input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input"
|
||||
timeout = 120
|
||||
audio_quality = "128k" # SaveAudioMP3: V0/128k/320k
|
||||
mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python"
|
||||
mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py"
|
||||
default_style = "default"
|
||||
|
||||
[[comfyui.styles]]
|
||||
name = "default"
|
||||
refer_wav = "/path/to/ref.wav"
|
||||
prompt_text = "参考音频逐字稿"
|
||||
language = "Auto" # 可选: Auto/Chinese/English/Japanese...
|
||||
model_choice = "1.7B"
|
||||
precision = "bf16"
|
||||
seed = 0
|
||||
max_new_tokens = 2048
|
||||
top_p = 0.8
|
||||
top_k = 20
|
||||
temperature = 1.0
|
||||
repetition_penalty = 1.05
|
||||
```
|
||||
|
||||
### 自动触发
|
||||
|
||||
LLM 判断需要语音回复时会自动触发,可通过概率控制:
|
||||
|
||||
```toml
|
||||
[probability]
|
||||
enabled = false # 默认关闭,每次都触发语音
|
||||
base_probability = 0.3 # 启用时 30% 概率触发
|
||||
```
|
||||
|
||||
### 智能分割插件支持
|
||||
|
||||
本插件已适配智能分割插件,支持使用 `|||SPLIT|||` 分隔符进行精确分段:
|
||||
|
||||
- **优先级**:智能分割标记 > 自动句子分割 > 单句发送
|
||||
- **使用方式**:智能分割插件会在适当位置插入 `|||SPLIT|||` 标记,本插件自动识别并按标记分段发送
|
||||
- **示例**:`今天天气不错|||SPLIT|||适合出去玩|||SPLIT|||你觉得呢` 会分成三段语音依次发送
|
||||
|
||||
## 项目结构
|
||||
|
||||
```
|
||||
tts_voice_plugin/
|
||||
├── plugin.py # 插件入口
|
||||
├── config.toml # 配置文件
|
||||
├── backends/ # 后端实现
|
||||
│ ├── ai_voice.py
|
||||
│ ├── gsv2p.py
|
||||
│ ├── gpt_sovits.py
|
||||
│ ├── doubao.py
|
||||
│   ├── cosyvoice.py
│   └── comfyui.py
└── utils/                    # 工具函数
|
||||
```
|
||||
|
||||
## 常见问题
|
||||
|
||||
**Q: Docker环境中提示"文件处理失败 识别URL失败"?**
|
||||
A: 留空 `audio_output_dir` 配置项,插件将使用项目根目录保存音频(相对路径)。如仍有问题,可设置 `use_base64_audio = true` 使用base64编码发送。
|
||||
|
||||
**Q: AI Voice 提示"仅支持群聊"?**
|
||||
A: AI Voice 只能在群聊使用,私聊会自动切换到其他后端。
|
||||
|
||||
**Q: 豆包语音怎么获取凭证?**
|
||||
A: 登录火山引擎控制台,开通语音合成服务获取。
|
||||
|
||||
**Q: 文本太长被截断?**
|
||||
A: 修改 `config.toml` 中 `max_text_length = 1000`
|
||||
|
||||
**Q: 语音合成失败时不想让Bot发送错误消息?**
|
||||
A: 设置 `send_error_messages = false`,语音合成失败时将静默处理,不向用户发送错误提示。
|
||||
|
||||
## 更新日志
|
||||
|
||||
### v3.2.3
|
||||
- 修复豆包语音 WAV 流式响应合并问题(正确处理 LIST/INFO 元数据块和多 header 情况)
|
||||
- 默认后端改为 CosyVoice(更稳定的声音克隆体验)
|
||||
- 默认关闭概率控制(每次触发都生成语音,更可预期的行为)
|
||||
- 优化 LLM 长度约束提示(利用"近因效应"提高遵守率)
|
||||
- 优化 action 记录格式,帮助 planner 避免重复执行
|
||||
- GSV2P/豆包音频格式默认改为 WAV(更好的兼容性)
|
||||
- CosyVoice 默认模式改为 3s 极速复刻(更快响应)
|
||||
- 更新默认超时配置(CosyVoice 300s, GSV2P 120s)
|
||||
|
||||
### v3.2.2
|
||||
- 适配智能分割插件(支持 `|||SPLIT|||` 分隔符精确分段)
|
||||
- GPT-SoVITS 支持数组格式配置(WebUI 友好,向后兼容字典格式)
|
||||
- 修复豆包语音音色信息显示乱码问题
|
||||
- 优化配置文件注释,更简洁清晰
|
||||
- 优化分段发送逻辑优先级(智能分割 > 自动分割 > 单句)
|
||||
- 禁用 Python 字节码生成(保持目录干净)
|
||||
- 添加插件 ID 标识字段
|
||||
|
||||
### v3.2.1
|
||||
- 新增 `send_error_messages` 配置项(可选择关闭错误提示消息)
|
||||
- 统一错误消息处理逻辑(通过 `_send_error` 方法)
|
||||
|
||||
### v3.2.0
|
||||
- 新增 CosyVoice 后端(阿里云 ModelScope,支持 17 种方言、3 秒声音克隆)
|
||||
- 新增分段发送功能(长文本自动分割逐句发送)
|
||||
- GPT-SoVITS 支持动态模型切换(在风格配置中指定 gpt_weights/sovits_weights)
|
||||
- GSV2P 后端新增重试机制(5 次重试,3 秒间隔)
|
||||
- 新增 `/cosyvoice` 命令
|
||||
- 新增 gradio_client 依赖
|
||||
|
||||
### v3.1.0
|
||||
- 新增豆包语音后端(火山引擎云服务)
|
||||
- 重构为模块化架构
|
||||
- HTTP Session 复用优化
|
||||
|
||||
## 信息
|
||||
|
||||
- 版本:3.2.3
|
||||
- 作者:靓仔
|
||||
- 许可:AGPL-v3.0
|
||||
|
|
@ -0,0 +1,235 @@
|
|||
{
|
||||
"manifest_version": 1,
|
||||
"name": "统一TTS语音合成插件",
|
||||
"version": "3.2.3",
|
||||
"description": "统一TTS语音合成插件,整合AI Voice、GSV2P、GPT-SoVITS、豆包语音、CosyVoice五种后端引擎,提供灵活的语音合成能力。",
|
||||
"author": {
|
||||
"name": "靓仔",
|
||||
"url": "https://github.com/xuqian13"
|
||||
},
|
||||
"license": "AGPL-v3.0",
|
||||
"homepage_url": "",
|
||||
"repository_url": "https://github.com/xuqian13/tts_voice_plugin",
|
||||
"keywords": [
|
||||
"TTS",
|
||||
"语音合成",
|
||||
"文本转语音",
|
||||
"AI语音",
|
||||
"GSV2P",
|
||||
"GPT-SoVITS",
|
||||
"豆包",
|
||||
"CosyVoice",
|
||||
"火山引擎",
|
||||
"多后端",
|
||||
"语音",
|
||||
"朗读",
|
||||
"音色",
|
||||
"语音播报",
|
||||
"方言",
|
||||
"声音克隆",
|
||||
"MaiCore"
|
||||
],
|
||||
"categories": [
|
||||
"语音",
|
||||
"AI",
|
||||
"聊天增强",
|
||||
"娱乐",
|
||||
"Utility",
|
||||
"Communication",
|
||||
"Accessibility"
|
||||
],
|
||||
"host_application": {
|
||||
"min_version": "0.12.0"
|
||||
},
|
||||
"default_locale": "zh-CN",
|
||||
"plugin_info": {
|
||||
"is_built_in": false,
|
||||
"plugin_type": "general",
|
||||
"components": [
|
||||
{
|
||||
"type": "action",
|
||||
"name": "unified_tts_action",
|
||||
"description": "统一TTS语音合成Action,支持四种后端引擎智能切换,LLM自主判断触发"
|
||||
},
|
||||
{
|
||||
"type": "command",
|
||||
"name": "unified_tts_command",
|
||||
"description": "统一TTS命令,支持/tts、/voice、/gsv2p、/doubao多种命令格式,灵活指定后端和音色"
|
||||
}
|
||||
],
|
||||
"features": [
|
||||
"支持五种TTS后端:AI Voice、GSV2P、GPT-SoVITS、豆包语音、CosyVoice",
|
||||
"AI Voice: MaiCore内置,简单快速,22+预设音色",
|
||||
"GSV2P: 云端API,高质量合成,丰富的调节参数",
|
||||
"GPT-SoVITS: 本地服务,高度定制化,多风格支持",
|
||||
"豆包语音: 字节跳动云服务,支持复刻音色和情感控制",
|
||||
"CosyVoice: 阿里云语音合成,支持17种方言、3秒声音克隆、情感控制",
|
||||
"模块化架构,后端独立实现,易于扩展",
|
||||
"HTTP Session复用,提升性能",
|
||||
"临时文件自动清理,避免并发冲突",
|
||||
"智能触发模式(LLM自主判断)和手动命令模式",
|
||||
"概率控制机制,避免语音回复过于频繁",
|
||||
"智能语言检测(中文/英文/日文)",
|
||||
"文本自动清理和网络用语转换",
|
||||
"完善的错误处理和重试机制",
|
||||
"灵活的配置系统,支持各后端独立配置"
|
||||
],
|
||||
"dependencies": {
|
||||
"python": [
|
||||
"aiohttp",
|
||||
"gradio_client"
|
||||
],
|
||||
"system": [],
|
||||
"plugins": []
|
||||
},
|
||||
"backend_info": {
|
||||
"ai_voice": {
|
||||
"provider": "MaiCore内置",
|
||||
"endpoint": "AI_VOICE_SEND命令",
|
||||
"authentication": "无需认证",
|
||||
"limitations": "仅支持群聊使用",
|
||||
"voices": "22+预设音色(小新、妲己、酥心御姐等)"
|
||||
},
|
||||
"gsv2p": {
|
||||
"provider": "GSV2P云服务",
|
||||
"endpoint": "https://gsv2p.acgnai.top/v1/audio/speech",
|
||||
"authentication": "需要API Token",
|
||||
"limitations": "API调用限制",
|
||||
"features": "高质量合成、多语言支持、丰富参数调节"
|
||||
},
|
||||
"gpt_sovits": {
|
||||
"provider": "本地GPT-SoVITS服务",
|
||||
"endpoint": "http://127.0.0.1:9880",
|
||||
"authentication": "无需认证",
|
||||
"limitations": "需要本地部署服务",
|
||||
"features": "高度定制化、多风格支持、模型权重切换"
|
||||
},
|
||||
"doubao": {
|
||||
"provider": "字节跳动火山引擎",
|
||||
"endpoint": "https://openspeech.bytedance.com/api/v3/tts/unidirectional",
|
||||
"authentication": "需要app_id、access_key、resource_id",
|
||||
"limitations": "需要火山引擎账号",
|
||||
"features": "快速高质量、支持复刻音色、情感语气控制"
|
||||
},
|
||||
"cosyvoice": {
|
||||
"provider": "阿里云 CosyVoice",
|
||||
"endpoint": "ModelScope Gradio API",
|
||||
"authentication": "无需认证(公开Gradio接口)",
|
||||
"limitations": "依赖ModelScope服务可用性",
|
||||
"features": "3秒声音克隆、17种方言支持、情感语速控制、自然语言指令"
|
||||
}
|
||||
}
|
||||
},
|
||||
"configuration": {
|
||||
"config_file": "config.toml",
|
||||
"config_template": "config.toml.example",
|
||||
"auto_generate": true,
|
||||
"sections": [
|
||||
{
|
||||
"name": "plugin",
|
||||
"description": "插件基本配置"
|
||||
},
|
||||
{
|
||||
"name": "general",
|
||||
"description": "通用设置(默认后端、超时、文本长度等)"
|
||||
},
|
||||
{
|
||||
"name": "components",
|
||||
"description": "组件启用控制"
|
||||
},
|
||||
{
|
||||
"name": "probability",
|
||||
"description": "概率控制配置(避免语音回复过于频繁)"
|
||||
},
|
||||
{
|
||||
"name": "ai_voice",
|
||||
"description": "AI Voice后端配置(音色映射等)"
|
||||
},
|
||||
{
|
||||
"name": "gsv2p",
|
||||
"description": "GSV2P后端配置(API地址、Token、参数等)"
|
||||
},
|
||||
{
|
||||
"name": "gpt_sovits",
|
||||
"description": "GPT-SoVITS后端配置(服务地址、风格配置等)"
|
||||
},
|
||||
{
|
||||
"name": "doubao",
|
||||
"description": "豆包语音后端配置(火山引擎认证、音色、情感等)"
|
||||
},
|
||||
{
|
||||
"name": "cosyvoice",
|
||||
"description": "CosyVoice后端配置(Gradio URL、模式、方言等)"
|
||||
}
|
||||
]
|
||||
},
|
||||
"usage_examples": [
|
||||
{
|
||||
"type": "action",
|
||||
"backend": "auto",
|
||||
"description": "LLM自动触发语音回复",
|
||||
"example": "用户:请用语音说\"你好世界\"\n机器人:[使用默认后端自动生成语音文件并发送]"
|
||||
},
|
||||
{
|
||||
"type": "command",
|
||||
"backend": "ai_voice",
|
||||
"description": "手动命令使用AI Voice",
|
||||
"example": "/tts 你好世界 小新"
|
||||
},
|
||||
{
|
||||
"type": "command",
|
||||
"backend": "gsv2p",
|
||||
"description": "手动命令使用GSV2P",
|
||||
"example": "/gsv2p 今天天气不错"
|
||||
},
|
||||
{
|
||||
"type": "command",
|
||||
"backend": "doubao",
|
||||
"description": "手动命令使用豆包语音",
|
||||
"example": "/doubao 你好世界"
|
||||
},
|
||||
{
|
||||
"type": "command",
|
||||
"backend": "gpt_sovits",
|
||||
"description": "手动命令使用GPT-SoVITS",
|
||||
"example": "/tts 测试一下 default gpt_sovits"
|
||||
},
|
||||
{
|
||||
"type": "command",
|
||||
"backend": "cosyvoice",
|
||||
"description": "手动命令使用CosyVoice",
|
||||
"example": "/cosyvoice 你好世界 四川话"
|
||||
},
|
||||
{
|
||||
"type": "command",
|
||||
"backend": "auto",
|
||||
"description": "使用默认后端",
|
||||
"example": "/tts 你好世界"
|
||||
}
|
||||
],
|
||||
"migration_info": {
|
||||
"from_plugins": [
|
||||
"ai_voice_plugin (v1.0.0)",
|
||||
"gsv2p_tts_plugin (v1.0.0)",
|
||||
"tts_voice_plugin (v2.0.0)",
|
||||
"tts_voice_plugin (v3.0.0)"
|
||||
],
|
||||
"migration_notes": [
|
||||
"本插件整合了ai_voice_plugin、gsv2p_tts_plugin和旧版tts_voice_plugin的所有功能",
|
||||
"v3.2.2适配智能分割插件(支持|||SPLIT|||分隔符精确分段)",
|
||||
"v3.2.2支持GPT-SoVITS数组格式配置(WebUI友好,向后兼容字典格式)",
|
||||
"v3.2.2修复豆包语音音色信息显示乱码问题",
|
||||
"v3.2.2优化配置文件注释,更简洁清晰",
|
||||
"v3.2.0新增CosyVoice后端支持(阿里云语音合成,支持17种方言和3秒声音克隆)",
|
||||
"v3.1.0新增豆包语音后端支持",
|
||||
"v3.1.0重构为模块化架构,提升代码可维护性",
|
||||
"配置文件需要重新生成,原配置需手动迁移",
|
||||
"建议备份旧插件配置后再迁移",
|
||||
"AI Voice音色映射保持兼容",
|
||||
"GSV2P API配置需重新填写Token",
|
||||
"GPT-SoVITS风格配置需要重新设置",
|
||||
"新增config.toml.example模板文件"
|
||||
]
|
||||
},
|
||||
"id": "tts_voice_plugin"
|
||||
}
|
||||
|
|
@ -0,0 +1,38 @@
|
|||
"""
|
||||
TTS后端模块
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.dont_write_bytecode = True
|
||||
|
||||
from .base import TTSBackendBase, TTSBackendRegistry, TTSResult
|
||||
from .ai_voice import AIVoiceBackend
|
||||
from .gsv2p import GSV2PBackend
|
||||
from .gpt_sovits import GPTSoVITSBackend
|
||||
from .doubao import DoubaoBackend
|
||||
from .cosyvoice import CosyVoiceBackend
|
||||
from .comfyui import ComfyUIBackend, ComfyUIVoiceCloneBackend, ComfyUICustomVoiceBackend
|
||||
|
||||
# 注册后端
|
||||
TTSBackendRegistry.register("ai_voice", AIVoiceBackend)
|
||||
TTSBackendRegistry.register("gsv2p", GSV2PBackend)
|
||||
TTSBackendRegistry.register("gpt_sovits", GPTSoVITSBackend)
|
||||
TTSBackendRegistry.register("doubao", DoubaoBackend)
|
||||
TTSBackendRegistry.register("cosyvoice", CosyVoiceBackend)
|
||||
TTSBackendRegistry.register("comfyui", ComfyUIBackend)
|
||||
TTSBackendRegistry.register("comfyui_voiceclone", ComfyUIVoiceCloneBackend)
|
||||
TTSBackendRegistry.register("comfyui_customvoice", ComfyUICustomVoiceBackend)
|
||||
|
||||
__all__ = [
|
||||
"TTSBackendBase",
|
||||
"TTSBackendRegistry",
|
||||
"TTSResult",
|
||||
"AIVoiceBackend",
|
||||
"GSV2PBackend",
|
||||
"GPTSoVITSBackend",
|
||||
"DoubaoBackend",
|
||||
"CosyVoiceBackend",
|
||||
"ComfyUIBackend",
|
||||
"ComfyUIVoiceCloneBackend",
|
||||
"ComfyUICustomVoiceBackend",
|
||||
]
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
"""
|
||||
AI Voice 后端实现
|
||||
使用 MaiCore 内置的 AI 语音功能
|
||||
"""
|
||||
|
||||
from typing import Optional, Callable, Dict
|
||||
from .base import TTSBackendBase, TTSResult
|
||||
from ..utils.text import TTSTextUtils
|
||||
from ..config_keys import ConfigKeys
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("tts_ai_voice")
|
||||
|
||||
# AI Voice alias map: human-readable character name -> MaiCore voice id.
# Used as the fallback when no alias map is provided in config.
# NOTE(review): "东北老妹儿" maps to a "guangdong" voice id — looks like a
# copy/paste slip upstream; confirm against MaiCore's voice list.
AI_VOICE_ALIAS_MAP = {
    "小新": "lucy-voice-laibixiaoxin",
    "猴哥": "lucy-voice-houge",
    "四郎": "lucy-voice-silang",
    "东北老妹儿": "lucy-voice-guangdong-f1",
    "广西大表哥": "lucy-voice-guangxi-m1",
    "妲己": "lucy-voice-daji",
    "霸道总裁": "lucy-voice-lizeyan",
    "酥心御姐": "lucy-voice-suxinjiejie",
    "说书先生": "lucy-voice-m8",
    "憨憨小弟": "lucy-voice-male1",
    "憨厚老哥": "lucy-voice-male3",
    "吕布": "lucy-voice-lvbu",
    "元气少女": "lucy-voice-xueling",
    "文艺少女": "lucy-voice-f37",
    "磁性大叔": "lucy-voice-male2",
    "邻家小妹": "lucy-voice-female1",
    "低沉男声": "lucy-voice-m14",
    "傲娇少女": "lucy-voice-f38",
    "爹系男友": "lucy-voice-m101",
    "暖心姐姐": "lucy-voice-female2",
    "温柔妹妹": "lucy-voice-f36",
    "书香少女": "lucy-voice-f34",
}
|
||||
|
||||
|
||||
class AIVoiceBackend(TTSBackendBase):
    """
    AI Voice backend.

    Uses MaiCore's built-in AI voice feature via the ``AI_VOICE_SEND``
    command. The actual send is delegated to a callable injected by the
    hosting Action/Command through :meth:`set_send_command`.

    Note: group chats only (``support_private_chat = False``).
    """

    backend_name = "ai_voice"
    backend_description = "MaiCore内置AI语音(仅群聊)"
    support_private_chat = False  # AI Voice cannot be used in private chats
    default_audio_format = ""     # no local audio file is produced

    def __init__(self, config_getter, log_prefix: str = ""):
        super().__init__(config_getter, log_prefix)
        # Callable that sends the AI_VOICE_SEND command; injected later
        # by the hosting Action/Command via set_send_command().
        self._send_command: Optional[Callable] = None

    def set_send_command(self, send_command_func: Callable) -> None:
        """Inject the command-sending callable (provided by Action/Command)."""
        self._send_command = send_command_func

    def get_default_voice(self) -> str:
        """Return the default character configured for AI Voice."""
        return self.get_config(ConfigKeys.AI_VOICE_DEFAULT_CHARACTER, "温柔妹妹")

    def resolve_voice(self, voice: Optional[str]) -> str:
        """Resolve a human-readable alias (e.g. "小新") to a MaiCore voice id."""
        alias_map: Dict[str, str] = self.get_config(
            ConfigKeys.AI_VOICE_ALIAS_MAP,
            AI_VOICE_ALIAS_MAP,
        )
        return TTSTextUtils.resolve_voice_alias(
            voice,
            alias_map,
            self.get_default_voice(),
            prefix="lucy-voice-",
        )

    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        **kwargs,
    ) -> TTSResult:
        """
        Synthesize and send speech via the AI_VOICE_SEND command.

        Args:
            text: Text to speak.
            voice: Character name or alias; falls back to the configured default.

        Returns:
            TTSResult describing success or failure.
        """
        if not self._send_command:
            return TTSResult(
                success=False,
                message="AI Voice后端未正确初始化(缺少send_command)",
                backend_name=self.backend_name,
            )

        # Resolve the alias before touching the network/command layer.
        character = self.resolve_voice(voice)

        try:
            # Only the awaited send can realistically raise; keep the
            # try body minimal.
            sent = await self._send_command(
                command_name="AI_VOICE_SEND",
                args={"text": text, "character": character},
                storage_message=False,
            )
        except Exception as e:
            logger.error(f"{self.log_prefix} AI语音执行错误: {e}")
            return TTSResult(
                success=False,
                message=f"AI语音执行错误: {e}",
                backend_name=self.backend_name,
            )

        if sent:
            logger.info(f"{self.log_prefix} AI语音发送成功 (音色: {character})")
            return TTSResult(
                success=True,
                message=f"成功发送AI语音 (音色: {character})",
                backend_name=self.backend_name,
            )
        return TTSResult(
            success=False,
            message="AI语音命令发送失败",
            backend_name=self.backend_name,
        )
|
||||
|
|
@ -0,0 +1,239 @@
|
|||
"""
|
||||
TTS后端抽象基类和注册表
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Type, Optional, Any, Callable, Tuple, Union
|
||||
from src.common.logger import get_logger
|
||||
from ..config_keys import ConfigKeys
|
||||
|
||||
logger = get_logger("tts_backend")
|
||||
|
||||
|
||||
@dataclass
class TTSResult:
    """Result of one TTS execution."""

    # Whether synthesis/sending succeeded.
    success: bool
    # Human-readable status message (shown in logs / error replies).
    message: str
    # Path of the generated audio file, when one was written to disk.
    audio_path: Optional[str] = None
    # Name of the backend that produced this result.
    backend_name: str = ""

    def __iter__(self):
        """Allow unpacking as ``success, message = result``."""
        return iter((self.success, self.message))
|
||||
|
||||
|
||||
class TTSBackendBase(ABC):
    """
    Abstract base class for TTS backends.

    Subclasses must implement :meth:`execute`. The base class provides
    the shared audio-delivery logic (:meth:`send_audio`, base64 or
    file-path mode) plus a small config/validation surface.
    """

    # Backend name (subclasses must override).
    backend_name: str = "base"

    # Human-readable backend description.
    backend_description: str = "TTS后端基类"

    # Whether the backend can be used in private chats.
    support_private_chat: bool = True

    # Default audio container format produced by the backend.
    default_audio_format: str = "mp3"

    def __init__(self, config_getter: Callable[[str, Any], Any], log_prefix: str = ""):
        """
        Args:
            config_getter: Config lookup function with signature
                ``get_config(key, default)``.
            log_prefix: Prefix for log messages; defaults to ``[backend_name]``.
        """
        self.get_config = config_getter
        self.log_prefix = log_prefix or f"[{self.backend_name}]"
        self._send_custom = None  # injected via set_send_custom()

    def set_send_custom(self, send_custom_func: Callable) -> None:
        """Inject the callable used to deliver custom (voice) messages."""
        self._send_custom = send_custom_func

    async def send_audio(
        self,
        audio_data: bytes,
        audio_format: str = "mp3",
        prefix: str = "tts",
        voice_info: str = "",
    ) -> TTSResult:
        """
        Deliver synthesized audio to the chat.

        Depending on configuration the audio is sent either as a base64
        payload (``voice`` message) or written to a temp file and sent by
        path (``voiceurl`` message; the file is cleaned up after 30 s).

        Args:
            audio_data: Raw audio bytes.
            audio_format: Container format (e.g. ``mp3``, ``wav``).
            prefix: Temp-file name prefix (file-path mode only).
            voice_info: Voice description embedded in the success message.

        Returns:
            TTSResult describing the outcome.
        """
        from ..utils.file import TTSFileManager

        # Fail fast when no delivery callback is set: avoids doing the
        # base64 encode, and — worse — writing a temp file that would
        # never be scheduled for cleanup (previous behavior leaked it).
        if not self._send_custom:
            logger.warning(f"{self.log_prefix} send_custom未设置,无法发送语音")
            return TTSResult(False, "send_custom回调未设置", backend_name=self.backend_name)

        use_base64 = self.get_config(ConfigKeys.GENERAL_USE_BASE64_AUDIO, False)
        logger.debug(f"{self.log_prefix} 开始发送音频 (原始大小: {len(audio_data)}字节, 格式: {audio_format})")

        if use_base64:
            # Base64 mode: embed the audio directly in the message payload.
            base64_audio = TTSFileManager.audio_to_base64(audio_data)
            if not base64_audio:
                return TTSResult(False, "音频数据转base64失败", backend_name=self.backend_name)

            logger.debug(f"{self.log_prefix} base64编码完成,准备通过send_custom发送")
            await self._send_custom(message_type="voice", content=base64_audio)
            logger.info(f"{self.log_prefix} 语音已通过send_custom发送 (base64模式, 音频大小: {len(audio_data)}字节)")

            return TTSResult(
                success=True,
                message=f"成功发送{self.backend_name}语音{(' ('+voice_info+')') if voice_info else ''}, base64模式",
                backend_name=self.backend_name,
            )

        # File-path mode: write to a temp file and send its path.
        output_dir = self.get_config(ConfigKeys.GENERAL_AUDIO_OUTPUT_DIR, "")
        audio_path = TTSFileManager.generate_temp_path(
            prefix=prefix,
            suffix=f".{audio_format}",
            output_dir=output_dir,
        )

        if not await TTSFileManager.write_audio_async(audio_path, audio_data):
            return TTSResult(False, "保存音频文件失败", backend_name=self.backend_name)

        logger.debug(f"{self.log_prefix} 音频文件已保存, 路径: {audio_path}")
        await self._send_custom(message_type="voiceurl", content=audio_path)
        logger.info(f"{self.log_prefix} 语音已通过send_custom发送 (文件路径模式, 路径: {audio_path})")
        # Delete the temp file after the adapter has had time to upload it.
        asyncio.create_task(TTSFileManager.cleanup_file_async(audio_path, delay=30))

        return TTSResult(
            success=True,
            message=f"成功发送{self.backend_name}语音{(' ('+voice_info+')') if voice_info else ''}",
            audio_path=audio_path,
            backend_name=self.backend_name,
        )

    @abstractmethod
    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        **kwargs,
    ) -> TTSResult:
        """
        Run TTS for *text*.

        Args:
            text: Text to synthesize.
            voice: Voice/style identifier (backend-specific).
            **kwargs: Extra backend-specific options (e.g. emotion).

        Returns:
            TTSResult with the outcome.
        """
        raise NotImplementedError

    def validate_config(self) -> Tuple[bool, str]:
        """
        Validate the backend's configuration.

        Returns:
            ``(is_valid, error_message)``; the base implementation
            accepts everything.
        """
        return True, ""

    def get_default_voice(self) -> str:
        """Return the backend's default voice (empty when not applicable)."""
        return ""

    def is_available(self) -> bool:
        """A backend is available exactly when its configuration validates."""
        valid, _ = self.validate_config()
        return valid
|
||||
|
||||
|
||||
class TTSBackendRegistry:
    """
    Registry of TTS backend classes (strategy + factory pattern).

    Backends register themselves by name at import time; callers then
    instantiate the configured backend via :meth:`create`.
    """

    _backends: Dict[str, Type[TTSBackendBase]] = {}

    @classmethod
    def register(cls, name: str, backend_class: Type[TTSBackendBase]) -> None:
        """
        Register *backend_class* under *name*.

        Overwrites any previously registered class with the same name.
        """
        cls._backends[name] = backend_class
        logger.debug(f"注册TTS后端: {name}")

    @classmethod
    def unregister(cls, name: str) -> None:
        """Remove *name* from the registry; no-op when absent."""
        cls._backends.pop(name, None)

    @classmethod
    def get(cls, name: str) -> Optional[Type[TTSBackendBase]]:
        """Return the backend class registered under *name*, or None."""
        return cls._backends.get(name)

    @classmethod
    def create(
        cls,
        name: str,
        config_getter: Callable[[str, Any], Any],
        log_prefix: str = "",
    ) -> Optional[TTSBackendBase]:
        """
        Instantiate the backend registered under *name*.

        Args:
            name: Backend name.
            config_getter: Config lookup function passed to the backend.
            log_prefix: Log prefix passed to the backend.

        Returns:
            A backend instance, or None when *name* is not registered.
        """
        backend_class = cls.get(name)
        return backend_class(config_getter, log_prefix) if backend_class else None

    @classmethod
    def list_backends(cls) -> list[str]:
        """Names of all registered backends."""
        return list(cls._backends.keys())

    @classmethod
    def is_registered(cls, name: str) -> bool:
        """Whether *name* is a registered backend."""
        return name in cls._backends
|
||||
|
|
@ -0,0 +1,827 @@
|
|||
"""
|
||||
ComfyUI backend (Workflow API).
|
||||
|
||||
This backend calls a fixed ComfyUI prompt graph that:
|
||||
LoadAudio -> MLX_Qwen3TTSVoiceClone -> SaveAudioMP3
|
||||
|
||||
Rationale:
|
||||
- ComfyUI expects API-format "prompt" graphs (not UI workflow json).
|
||||
- For audio inputs, the simplest reliable path is to copy the reference audio into ComfyUI/input
|
||||
and use the built-in LoadAudio node.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from typing import Any, ClassVar, Dict, Optional, Tuple
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.plugin_system.apis import generator_api
|
||||
|
||||
from .base import TTSBackendBase, TTSResult
|
||||
from ..config_keys import ConfigKeys
|
||||
from ..utils.file import TTSFileManager
|
||||
from ..utils.session import TTSSessionManager
|
||||
from ..utils.text import TTSTextUtils
|
||||
|
||||
logger = get_logger("tts_comfyui")
|
||||
|
||||
|
||||
# Map TTSTextUtils.detect_language() codes to the language combo values
# expected by the MLX Qwen3-TTS ComfyUI nodes; any other code falls back
# to "Auto" at the call site (see execute()).
LANG_TO_DEMO = {
    "zh": "Chinese",
    "ja": "Japanese",
    "en": "English",
}
|
||||
|
||||
|
||||
class ComfyUIBackend(TTSBackendBase):
    """
    TTS backend driving a fixed ComfyUI prompt graph
    (MLX Qwen3-TTS VoiceClone / CustomVoice -> SaveAudioMP3).
    """

    backend_name = "comfyui"
    backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone/CustomVoice)"
    support_private_chat = True
    default_audio_format = "mp3"

    # Maps "abs-path:mtime:size" of a reference audio -> its copied
    # filename inside ComfyUI/input (see _ensure_ref_in_input).
    _ref_cache: ClassVar[Dict[str, str]] = {}
    # Maps "config-sig:lang:text-sig" -> inferred instruct line
    # (see _infer_instruct).
    _instruct_cache: ClassVar[Dict[str, str]] = {}
    # If set by subclasses, only these modes are allowed (e.g. {"voice_clone"}).
    allowed_modes: ClassVar[Optional[set[str]]] = None

    def get_default_voice(self) -> str:
        """Return the configured default style name ("default" when unset)."""
        return self.get_config(ConfigKeys.COMFYUI_DEFAULT_STYLE, "default")
|
||||
|
||||
def _filter_styles_by_mode(self, styles: Dict[str, Any]) -> Dict[str, Any]:
|
||||
allowed = self.allowed_modes
|
||||
if not allowed:
|
||||
return styles
|
||||
out: Dict[str, Any] = {}
|
||||
for name, st in (styles or {}).items():
|
||||
if not isinstance(st, dict):
|
||||
continue
|
||||
mode = str(st.get("mode") or "voice_clone").strip()
|
||||
if mode in allowed:
|
||||
out[name] = st
|
||||
return out
|
||||
|
||||
def _normalize_styles_config(self, styles_config: Any) -> Dict[str, Any]:
|
||||
# Match GPT-SoVITS backend style schema: list[{name,...}] or dict{name:{...}}
|
||||
if isinstance(styles_config, dict):
|
||||
return styles_config
|
||||
if isinstance(styles_config, list):
|
||||
result = {}
|
||||
for style in styles_config:
|
||||
if isinstance(style, dict) and "name" in style:
|
||||
name = style["name"]
|
||||
result[name] = {k: v for k, v in style.items() if k != "name"}
|
||||
return result
|
||||
return {}
|
||||
|
||||
def _clean_instruct(self, s: str, max_chars: int) -> str:
|
||||
s = (s or "").strip()
|
||||
if not s:
|
||||
return ""
|
||||
|
||||
# Strip common wrappers.
|
||||
s = s.replace("```", "").strip()
|
||||
s = re.sub(r"^instruct\\s*[::]\\s*", "", s, flags=re.IGNORECASE).strip()
|
||||
|
||||
# Prefer first non-empty line.
|
||||
for line in s.splitlines():
|
||||
line = line.strip()
|
||||
if line:
|
||||
s = line
|
||||
break
|
||||
|
||||
# Trim quotes.
|
||||
if len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")):
|
||||
s = s[1:-1].strip()
|
||||
|
||||
if max_chars and len(s) > max_chars:
|
||||
s = s[:max_chars].rstrip()
|
||||
return s
|
||||
|
||||
def _clean_base_tone(self, s: str) -> str:
|
||||
"""
|
||||
Clean a base tone/persona string so it can safely live inside `基调=...`:
|
||||
- single-line
|
||||
- no semicolons (they are field separators)
|
||||
- no '=' (KV separator)
|
||||
"""
|
||||
s = (s or "").strip()
|
||||
if not s:
|
||||
return ""
|
||||
s = s.replace("\r", " ").replace("\n", " ")
|
||||
s = re.sub(r"\\s+", " ", s).strip()
|
||||
# Avoid breaking KV parsing.
|
||||
s = s.replace(";", ",").replace(";", ",")
|
||||
s = s.replace("=", " ").replace("=", " ")
|
||||
return s.strip(" ,,")
|
||||
|
||||
def _attach_base_tone(self, instruct: str, max_chars: int) -> str:
    """
    Prefix the inferred instruct with the configured base tone/persona:
    `基调=<...>;情绪=...;语速=...;停顿=...`.

    When the combined line would exceed *max_chars*, the base tone is
    trimmed first so the inferred instruct fields stay intact; an
    existing `基调` field in *instruct* is never overridden.
    """
    base_raw = self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_BASE_TONE, "") or ""
    base = self._clean_base_tone(str(base_raw))
    if not base:
        # No base tone configured: pass the instruct through unchanged.
        return (instruct or "").strip()

    s = (instruct or "").strip()
    fields = self._parse_instruct_fields(s)
    if "基调" in fields:
        # Respect a base tone the model already emitted.
        return s

    prefix = f"基调={base}"
    if not s:
        return prefix[:max_chars].rstrip() if max_chars else prefix

    combined = f"{prefix};{s}"
    if not max_chars or len(combined) <= max_chars:
        return combined

    # Too long: trim the base first, keeping the inferred instruct intact.
    remain = max_chars - len(s) - len(";") - len("基调=")
    if remain <= 0:
        # Can't fit the base at all; keep instruct (already max_chars-limited upstream).
        return s[:max_chars].rstrip()
    base_trim = base[:remain].rstrip(" ,,")
    return f"基调={base_trim};{s}"
|
||||
|
||||
def _parse_instruct_fields(self, instruct: str) -> Dict[str, str]:
|
||||
"""
|
||||
Parse a 1-line instruct like:
|
||||
情绪=愤怒;语速=很快;停顿=很少;表现=咬牙切齿
|
||||
|
||||
We only *use* a few keys (情绪/语速/停顿/强度/表现...), but keep it generic.
|
||||
"""
|
||||
s = (instruct or "").strip()
|
||||
if not s:
|
||||
return {}
|
||||
|
||||
# Normalize separators (full-width punctuation).
|
||||
s = s.replace(";", ";").replace(":", ":").replace("=", "=")
|
||||
|
||||
# Split by semicolon/comma-like separators.
|
||||
parts = [p.strip() for p in re.split(r"[;]+", s) if p.strip()]
|
||||
out: Dict[str, str] = {}
|
||||
for p in parts:
|
||||
if "=" not in p:
|
||||
continue
|
||||
k, v = p.split("=", 1)
|
||||
k = k.strip()
|
||||
v = v.strip()
|
||||
if not k or not v:
|
||||
continue
|
||||
# Limit key length to avoid garbage.
|
||||
if len(k) > 8:
|
||||
continue
|
||||
out[k] = v
|
||||
return out
|
||||
|
||||
def _map_speed_label(self, label: str) -> Optional[float]:
|
||||
lab = (label or "").strip()
|
||||
m = {
|
||||
"很慢": 0.85,
|
||||
"稍慢": 0.93,
|
||||
"正常": 1.00,
|
||||
"稍快": 1.07,
|
||||
"很快": 1.15,
|
||||
}
|
||||
return m.get(lab)
|
||||
|
||||
def _map_pause_label(self, label: str) -> Optional[float]:
|
||||
lab = (label or "").strip()
|
||||
m = {
|
||||
"很少": 0.6,
|
||||
"自然": 1.0,
|
||||
"稍多": 1.3,
|
||||
"很多": 1.7,
|
||||
}
|
||||
return m.get(lab)
|
||||
|
||||
def _ensure_base_pause_cfg(self, pause_cfg: Dict[str, float]) -> Dict[str, float]:
|
||||
# If caller didn't configure pauses (all zeros), apply a conservative base so "停顿" can take effect.
|
||||
keys = ["pause_linebreak", "period_pause", "comma_pause", "question_pause", "hyphen_pause"]
|
||||
if all(float(pause_cfg.get(k, 0.0) or 0.0) == 0.0 for k in keys):
|
||||
return {
|
||||
**pause_cfg,
|
||||
"pause_linebreak": 0.18,
|
||||
"period_pause": 0.22,
|
||||
"comma_pause": 0.10,
|
||||
"question_pause": 0.20,
|
||||
"hyphen_pause": 0.06,
|
||||
}
|
||||
return pause_cfg
|
||||
|
||||
def _enrich_instruct_for_emotion(self, instruct: str, max_chars: int) -> str:
|
||||
"""
|
||||
Add short performance cues for common emotions, keeping it single-line KV style.
|
||||
This helps when the model under-reacts to simple labels like "愤怒".
|
||||
"""
|
||||
s = (instruct or "").strip()
|
||||
if not s:
|
||||
return ""
|
||||
|
||||
fields = self._parse_instruct_fields(s)
|
||||
emo = fields.get("情绪", "")
|
||||
if not emo:
|
||||
return s
|
||||
|
||||
# Only add if it doesn't already contain a "表现=" field.
|
||||
if "表现" in fields:
|
||||
return s
|
||||
|
||||
emo_norm = emo
|
||||
cues = ""
|
||||
if "愤怒" in emo_norm or "生气" in emo_norm:
|
||||
cues = "声压高,咬字重,重音强,尾音下压"
|
||||
elif "开心" in emo_norm or "高兴" in emo_norm:
|
||||
cues = "笑意明显,轻快上扬,尾音明亮"
|
||||
elif "悲伤" in emo_norm or "难过" in emo_norm:
|
||||
cues = "气声略多,音量偏低,语尾下沉"
|
||||
elif "温柔" in emo_norm:
|
||||
cues = "音量轻,气声柔,语尾轻收"
|
||||
elif "冷淡" in emo_norm or "冷静" in emo_norm:
|
||||
cues = "平直克制,少起伏,干净收尾"
|
||||
|
||||
if not cues:
|
||||
return s
|
||||
|
||||
extra = f";表现={cues}"
|
||||
if max_chars and len(s) + len(extra) > max_chars:
|
||||
# Trim cues to fit.
|
||||
allow = max_chars - len(s) - len(";表现=")
|
||||
if allow <= 0:
|
||||
return s[:max_chars].rstrip()
|
||||
cues = cues[:allow].rstrip(",, ")
|
||||
extra = f";表现={cues}"
|
||||
return (s + extra)[:max_chars].rstrip() if max_chars else (s + extra)
|
||||
|
||||
def _apply_instruct_controls(
|
||||
self, instruct: str, speed: float, pause_cfg: Dict[str, float], max_chars: int
|
||||
) -> Tuple[str, float, Dict[str, float]]:
|
||||
"""
|
||||
If instruct contains '语速'/'停顿', map them to real synthesis controls.
|
||||
This makes auto_instruct meaningfully affect output even if the model is insensitive to labels.
|
||||
"""
|
||||
s = (instruct or "").strip()
|
||||
if not s:
|
||||
return "", speed, pause_cfg
|
||||
|
||||
fields = self._parse_instruct_fields(s)
|
||||
speed_label = fields.get("语速", "")
|
||||
pause_label = fields.get("停顿", "")
|
||||
|
||||
out_speed = float(speed)
|
||||
mapped_speed = self._map_speed_label(speed_label)
|
||||
if mapped_speed is not None:
|
||||
out_speed = mapped_speed
|
||||
|
||||
out_pause_cfg = dict(pause_cfg or {})
|
||||
mapped_pause = self._map_pause_label(pause_label)
|
||||
if mapped_pause is not None:
|
||||
out_pause_cfg = self._ensure_base_pause_cfg(out_pause_cfg)
|
||||
for k in ["pause_linebreak", "period_pause", "comma_pause", "question_pause", "hyphen_pause"]:
|
||||
try:
|
||||
out_pause_cfg[k] = float(out_pause_cfg.get(k, 0.0) or 0.0) * float(mapped_pause)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Add short performance cues (kept within max_chars).
|
||||
s = self._enrich_instruct_for_emotion(s, max_chars=max_chars)
|
||||
return s, out_speed, out_pause_cfg
|
||||
|
||||
async def _infer_instruct(
    self,
    text: str,
    detected_lang: str,
    chat_stream=None,
    chat_id: Optional[str] = None,
    style_name: str = "",
) -> str:
    """
    Infer a short CustomVoice `instruct` line for *text* via MaiBot's
    LLM interface. Results are cached per (config signature, language,
    text signature).

    Args:
        text: Text that will be spoken.
        detected_lang: Detected language code ("zh"/"ja"/"en"/...); may be "".
        chat_stream / chat_id: Reserved for context-aware prompting;
            currently unused by the generation call.
        style_name: Style name, used only in log messages.

    Returns:
        The inferred instruct line, or "" when disabled or on failure.
    """
    enabled = bool(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_ENABLED, False))
    if not enabled:
        return ""

    max_chars = int(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_MAX_CHARS, 40) or 40)

    # Default prompt: output ONE short instruct line only.
    # BUG FIX: the template previously used "\\n" (literal backslash-n),
    # so the prompt reached the LLM as one long line; real newlines are
    # intended here.
    default_tpl = (
        "你是配音导演。请根据要朗读的文本生成一行 TTS instruct。\n"
        "硬性要求:必须同时包含【情绪】【语速】【停顿】三项。可以额外补充 1-2 个表演提示(如 音量/重音/音高/表现)。\n"
        "只输出一行,不要解释,不要复述原文,不要引号/代码块。\n"
        "输出格式固定为:情绪=<...>;语速=<...>;停顿=<...>\n"
        "语速可选:很慢/稍慢/正常/稍快/很快。\n"
        "停顿可选:很少/自然/稍多/很多。\n"
        "长度<= {max_chars} 字。\n"
        "文本语言: {lang}\n"
        "待朗读文本: {text}\n"
    )
    prompt_tpl = str(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_PROMPT, default_tpl) or "")
    if not prompt_tpl.strip():
        prompt_tpl = default_tpl

    # Cache key must change whenever prompt/base_tone/max_chars changes.
    base_raw = str(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_BASE_TONE, "") or "")
    cfg_sig_src = f"{max_chars}\n{prompt_tpl}\n{base_raw}"
    cfg_sig = hashlib.sha256(cfg_sig_src.encode("utf-8")).hexdigest()[:12]
    text_sig = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
    cache_key = f"{cfg_sig}:{detected_lang}:{text_sig}"
    cached = self._instruct_cache.get(cache_key)
    if cached:
        return cached

    lang = detected_lang or "auto"
    prompt = prompt_tpl.format(text=text.strip(), lang=lang, max_chars=max_chars)

    try:
        resp = await generator_api.generate_tts_instruct(
            prompt=prompt,
            request_type="tts_instruct",
        )
        instruct = self._clean_instruct(resp or "", max_chars=max_chars)
        instruct = self._attach_base_tone(instruct, max_chars=max_chars)
        if instruct:
            # Only cache non-empty results so transient failures retry.
            self._instruct_cache[cache_key] = instruct
        return instruct
    except Exception as e:
        logger.warning(f"{self.log_prefix} auto_instruct 失败(style={style_name}): {e}")
        return ""
|
||||
|
||||
def validate_config(self) -> Tuple[bool, str]:
    """
    Validate the ComfyUI backend configuration.

    Checks that a server URL, an input directory and at least one style
    are configured, that the default style exists (falling back to the
    style named "default"), and that the default style carries the
    fields its mode requires.

    Returns:
        (is_valid, error_message); error_message is "" when valid.
    """
    server = self.get_config(ConfigKeys.COMFYUI_SERVER, "http://127.0.0.1:8188")
    if not server:
        return False, "ComfyUI 未配置 server"

    # NOTE(review): the fallback default is a developer-machine absolute
    # path — consider a neutral default or requiring explicit config.
    input_dir = self.get_config(
        ConfigKeys.COMFYUI_INPUT_DIR,
        "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
    )
    if not input_dir:
        return False, "ComfyUI 未配置 input_dir"

    styles_raw = self.get_config(ConfigKeys.COMFYUI_STYLES, {})
    styles = self._normalize_styles_config(styles_raw)
    if not styles:
        return False, "ComfyUI 后端未配置任何风格(至少需要配置 1 个 style)"

    default_name = self.get_default_voice() or "default"
    if default_name not in styles:
        # Fall back to the style named "default" if present.
        if "default" in styles:
            default_name = "default"
        else:
            return False, f"ComfyUI default_style='{default_name}' 不存在"

    # Per-mode completeness check on the default style only.
    st = styles.get(default_name, {})
    mode = (st.get("mode") or "voice_clone").strip()
    if mode == "voice_clone":
        if not st.get("refer_wav") or not st.get("prompt_text"):
            return False, f"ComfyUI 风格 '{default_name}' 配置不完整(voice_clone 需要 refer_wav 和 prompt_text)"
    elif mode == "custom_voice":
        if not st.get("model_path") or not st.get("speaker"):
            return False, f"ComfyUI 风格 '{default_name}' 配置不完整(custom_voice 需要 model_path 和 speaker)"
    else:
        return False, f"ComfyUI 风格 '{default_name}' mode 无效: {mode}"

    return True, ""
|
||||
|
||||
def _ensure_ref_in_input(self, input_dir: str, refer_wav: str) -> str:
    """
    Copy the reference audio into ComfyUI's input directory (if not
    already there) so the built-in LoadAudio node can read it.

    The destination filename is derived from the source path + mtime +
    size, so a changed reference file gets a fresh copy; results are
    memoized in the class-level _ref_cache.

    Args:
        input_dir: ComfyUI input directory (created if missing).
        refer_wav: Path (resolved via TTSFileManager) to the reference audio.

    Returns:
        The bare filename inside *input_dir* to pass to LoadAudio.

    Raises:
        FileNotFoundError: When the resolved reference audio does not exist.
    """
    refer_wav = TTSFileManager.resolve_path(refer_wav)
    if not os.path.exists(refer_wav):
        raise FileNotFoundError(f"参考音频不存在: {refer_wav}")

    st = os.stat(refer_wav)
    # Key changes whenever the source file content could have changed.
    cache_key = f"{os.path.abspath(refer_wav)}:{st.st_mtime_ns}:{st.st_size}"
    if cache_key in self._ref_cache:
        name = self._ref_cache[cache_key]
        # Re-verify: the file may have been cleaned out of input_dir.
        if os.path.exists(os.path.join(input_dir, name)):
            return name

    ext = os.path.splitext(refer_wav)[1] or ".wav"
    h = hashlib.sha256(cache_key.encode("utf-8")).hexdigest()[:16]
    name = f"maibot_ref_{h}{ext}"
    dst = os.path.join(input_dir, name)

    os.makedirs(input_dir, exist_ok=True)
    if not os.path.exists(dst):
        # Keep it simple: copy file bytes. LoadAudio can decode common formats (wav/mp3).
        import shutil

        shutil.copyfile(refer_wav, dst)

    self._ref_cache[cache_key] = name
    return name
|
||||
|
||||
def _build_prompt_voice_clone(
|
||||
self,
|
||||
ref_filename: str,
|
||||
ref_text: str,
|
||||
target_text: str,
|
||||
language: str,
|
||||
model_choice: str,
|
||||
precision: str,
|
||||
seed: int,
|
||||
max_new_tokens: int,
|
||||
top_p: float,
|
||||
top_k: int,
|
||||
temperature: float,
|
||||
repetition_penalty: float,
|
||||
audio_quality: str,
|
||||
mlx_python: str,
|
||||
mlx_cli: str,
|
||||
pause_cfg: Dict[str, float],
|
||||
) -> Dict[str, Any]:
|
||||
# Node IDs are arbitrary but stable in this prompt template.
|
||||
# 1: LoadAudio -> outputs AUDIO
|
||||
# 2: Pause config (FB_Qwen3TTSConfig) -> outputs TTS_CONFIG
|
||||
# 3: MLX VoiceClone -> outputs AUDIO
|
||||
# 4: SaveAudioMP3 -> outputs UI audio file info
|
||||
filename_prefix = f"audio/maibot_comfyui_{int(time.time())}_{uuid.uuid4().hex[:8]}"
|
||||
prompt: Dict[str, Any] = {
|
||||
"1": {
|
||||
"class_type": "LoadAudio",
|
||||
"inputs": {
|
||||
"audio": ref_filename,
|
||||
},
|
||||
},
|
||||
"2": {
|
||||
"class_type": "FB_Qwen3TTSConfig",
|
||||
"inputs": {
|
||||
"pause_linebreak": float(pause_cfg.get("pause_linebreak", 0.0)),
|
||||
"period_pause": float(pause_cfg.get("period_pause", 0.0)),
|
||||
"comma_pause": float(pause_cfg.get("comma_pause", 0.0)),
|
||||
"question_pause": float(pause_cfg.get("question_pause", 0.0)),
|
||||
"hyphen_pause": float(pause_cfg.get("hyphen_pause", 0.0)),
|
||||
},
|
||||
},
|
||||
"3": {
|
||||
"class_type": "MLX_Qwen3TTSVoiceClone",
|
||||
"inputs": {
|
||||
"target_text": target_text,
|
||||
"model_choice": model_choice,
|
||||
"device": "auto",
|
||||
"precision": precision,
|
||||
"language": language,
|
||||
"ref_audio": ["1", 0],
|
||||
"ref_text": ref_text,
|
||||
"seed": int(seed),
|
||||
"max_new_tokens": int(max_new_tokens),
|
||||
"top_p": float(top_p),
|
||||
"top_k": int(top_k),
|
||||
"temperature": float(temperature),
|
||||
"repetition_penalty": float(repetition_penalty),
|
||||
"attention": "auto",
|
||||
"unload_model_after_generate": False,
|
||||
"config": ["2", 0],
|
||||
"mlx_python": mlx_python,
|
||||
"mlx_cli": mlx_cli,
|
||||
},
|
||||
},
|
||||
"4": {
|
||||
"class_type": "SaveAudioMP3",
|
||||
"inputs": {
|
||||
"audio": ["3", 0],
|
||||
"filename_prefix": filename_prefix,
|
||||
"quality": audio_quality,
|
||||
},
|
||||
},
|
||||
}
|
||||
return prompt
|
||||
|
||||
def _build_prompt_custom_voice(
|
||||
self,
|
||||
target_text: str,
|
||||
speaker: str,
|
||||
model_path: str,
|
||||
instruct: str,
|
||||
speed: float,
|
||||
language: str,
|
||||
seed: int,
|
||||
max_new_tokens: int,
|
||||
top_p: float,
|
||||
top_k: int,
|
||||
temperature: float,
|
||||
repetition_penalty: float,
|
||||
audio_quality: str,
|
||||
mlx_python: str,
|
||||
mlx_cli: str,
|
||||
pause_cfg: Dict[str, float],
|
||||
) -> Dict[str, Any]:
|
||||
# 2: Pause config (FB_Qwen3TTSConfig) -> outputs TTS_CONFIG
|
||||
# 3: MLX CustomVoice -> outputs AUDIO
|
||||
# 4: SaveAudioMP3 -> outputs UI audio file info
|
||||
filename_prefix = f"audio/maibot_comfyui_{int(time.time())}_{uuid.uuid4().hex[:8]}"
|
||||
prompt: Dict[str, Any] = {
|
||||
"2": {
|
||||
"class_type": "FB_Qwen3TTSConfig",
|
||||
"inputs": {
|
||||
"pause_linebreak": float(pause_cfg.get("pause_linebreak", 0.0)),
|
||||
"period_pause": float(pause_cfg.get("period_pause", 0.0)),
|
||||
"comma_pause": float(pause_cfg.get("comma_pause", 0.0)),
|
||||
"question_pause": float(pause_cfg.get("question_pause", 0.0)),
|
||||
"hyphen_pause": float(pause_cfg.get("hyphen_pause", 0.0)),
|
||||
},
|
||||
},
|
||||
"3": {
|
||||
"class_type": "MLX_Qwen3TTSCustomVoice",
|
||||
"inputs": {
|
||||
"text": target_text,
|
||||
"speaker": speaker,
|
||||
"model_path": model_path,
|
||||
"instruct": instruct or "",
|
||||
"speed": float(speed),
|
||||
"language": language,
|
||||
"seed": int(seed),
|
||||
"max_new_tokens": int(max_new_tokens),
|
||||
"top_p": float(top_p),
|
||||
"top_k": int(top_k),
|
||||
"temperature": float(temperature),
|
||||
"repetition_penalty": float(repetition_penalty),
|
||||
"config": ["2", 0],
|
||||
"mlx_python": mlx_python,
|
||||
"mlx_cli": mlx_cli,
|
||||
},
|
||||
},
|
||||
"4": {
|
||||
"class_type": "SaveAudioMP3",
|
||||
"inputs": {
|
||||
"audio": ["3", 0],
|
||||
"filename_prefix": filename_prefix,
|
||||
"quality": audio_quality,
|
||||
},
|
||||
},
|
||||
}
|
||||
return prompt
|
||||
|
||||
async def _queue_and_wait(
    self, server: str, prompt: Dict[str, Any], timeout: int
) -> Dict[str, Any]:
    """
    Submit *prompt* to ComfyUI's /prompt endpoint, then poll
    /history/<prompt_id> until the job finishes or *timeout* elapses.

    Args:
        server: Base URL of the ComfyUI server.
        prompt: API-format prompt graph.
        timeout: Per-request timeout (seconds), also the overall polling deadline.

    Returns:
        The history entry for the submitted prompt.

    Raises:
        RuntimeError: When /prompt fails or reports an error.
        TimeoutError: When the job does not finish before the deadline.
    """
    session_manager = await TTSSessionManager.get_instance()
    prompt_id = str(uuid.uuid4())

    post_url = f"{server.rstrip('/')}/prompt"
    payload = {
        "prompt": prompt,
        "client_id": "maibot-tts-voice-plugin",
        "prompt_id": prompt_id,
    }

    async with session_manager.post(
        post_url, json=payload, backend_name=self.backend_name, timeout=timeout
    ) as resp:
        # ROBUSTNESS FIX: check the HTTP status before parsing JSON —
        # error pages are often not JSON, and parsing first raised an
        # opaque decode error that hid the real status code.
        if resp.status != 200:
            body = await resp.text()
            raise RuntimeError(f"ComfyUI /prompt 失败: {resp.status} {body[:200]}")
        data = await resp.json(content_type=None)
        if "error" in data:
            raise RuntimeError(f"ComfyUI /prompt 返回错误: {data['error']}")

    # Poll history until prompt_id appears (i.e. the job has finished).
    hist_url = f"{server.rstrip('/')}/history/{prompt_id}"
    deadline = time.time() + float(timeout)
    while time.time() < deadline:
        async with session_manager.get(
            hist_url, backend_name=self.backend_name, timeout=timeout
        ) as resp:
            history = await resp.json(content_type=None)
            if prompt_id in history:
                return history[prompt_id]
        await asyncio.sleep(0.35)

    raise TimeoutError("等待 ComfyUI 生成超时")
|
||||
|
||||
async def _download_output_audio(self, server: str, history_item: Dict[str, Any], timeout: int) -> bytes:
    """
    Locate the audio produced by the save node in *history_item* and
    download it via ComfyUI's /view endpoint.

    Prefers node "4" (the node ID used by both prompt templates), but —
    GENERALIZATION — falls back to scanning every node's outputs so a
    renumbered template still works.

    Returns:
        Raw audio bytes.

    Raises:
        RuntimeError: When no audio output exists or /view fails.
    """
    outputs = history_item.get("outputs") or {}
    audios = (outputs.get("4") or {}).get("audio") or []
    if not audios:
        # Fall back to the first node that produced any audio.
        for node_out in outputs.values():
            candidate = (node_out or {}).get("audio") or []
            if candidate:
                audios = candidate
                break
    if not audios:
        # Some failures show up only in status/messages.
        status = history_item.get("status") or {}
        raise RuntimeError(f"ComfyUI 未产出音频. status={status}")

    a0 = audios[0]
    filename = a0.get("filename")
    subfolder = a0.get("subfolder", "")
    folder_type = a0.get("type", "output")
    if not filename:
        raise RuntimeError(f"ComfyUI 音频输出结构异常: {a0}")

    q = urlencode({"filename": filename, "subfolder": subfolder, "type": folder_type})
    url = f"{server.rstrip('/')}/view?{q}"

    session_manager = await TTSSessionManager.get_instance()
    async with session_manager.get(url, backend_name=self.backend_name, timeout=timeout) as resp:
        if resp.status != 200:
            txt = await resp.text()
            raise RuntimeError(f"ComfyUI /view 失败: {resp.status} {txt[:200]}")
        return await resp.read()
|
||||
|
||||
async def execute(self, text: str, voice: Optional[str] = None, **kwargs) -> TTSResult:
    """
    Synthesize *text* through the ComfyUI workflow and send the audio.

    Flow: validate config -> resolve style (with fallbacks) -> build the
    mode-specific prompt graph -> queue on ComfyUI -> download the MP3 ->
    validate -> send via the base class.

    Args:
        text: Text to speak.
        voice: Style name; falls back to default_style / "default" / first style.
        **kwargs: May carry chat_stream / chat_id for instruct inference.

    Returns:
        TTSResult; all backend errors are caught and reported as a failure.
    """
    is_valid, err = self.validate_config()
    if not is_valid:
        return TTSResult(False, err, backend_name=self.backend_name)

    if not text or not text.strip():
        return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

    # ---- Resolve server / paths / timeouts from configuration. ----
    server = self.get_config(ConfigKeys.COMFYUI_SERVER, "http://127.0.0.1:8188")
    # NOTE(review): developer-machine absolute paths as fallback defaults below.
    input_dir = self.get_config(
        ConfigKeys.COMFYUI_INPUT_DIR,
        "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
    )
    timeout = int(self.get_config(ConfigKeys.COMFYUI_TIMEOUT, self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60)))

    audio_quality = self.get_config(ConfigKeys.COMFYUI_AUDIO_QUALITY, "128k")
    mlx_python = self.get_config(
        ConfigKeys.COMFYUI_MLX_PYTHON,
        "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python",
    )
    mlx_cli = self.get_config(
        ConfigKeys.COMFYUI_MLX_CLI,
        "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py",
    )

    # ---- Resolve the style to use. ----
    styles_raw = self.get_config(ConfigKeys.COMFYUI_STYLES, {})
    styles = self._filter_styles_by_mode(self._normalize_styles_config(styles_raw))

    style_name = (voice or self.get_default_voice() or "").strip() or "default"
    if style_name not in styles:
        # For split backends (voiceclone/customvoice), make "wrong style" errors explicit.
        if (voice or "").strip() and self.allowed_modes:
            return TTSResult(
                False,
                f"ComfyUI风格 '{style_name}' 不存在或不属于当前后端({self.backend_name})",
                backend_name=self.backend_name,
            )
        # Fallback order: "default" -> first available style (alphabetical).
        if "default" in styles:
            style_name = "default"
        elif styles:
            style_name = sorted(styles.keys())[0]
        else:
            return TTSResult(
                False,
                f"ComfyUI 未配置任何风格({self.backend_name})",
                backend_name=self.backend_name,
            )
    style = styles.get(style_name, {})

    # ---- Per-mode required fields. ----
    mode = (style.get("mode") or "voice_clone").strip()
    if mode == "voice_clone":
        refer_wav = style.get("refer_wav", "")
        prompt_text = style.get("prompt_text", "")
        if not refer_wav or not prompt_text:
            return TTSResult(False, f"ComfyUI风格 '{style_name}' 配置不完整(voice_clone)", backend_name=self.backend_name)
    elif mode == "custom_voice":
        model_path = style.get("model_path", "")
        speaker = style.get("speaker", "")
        if not model_path or not speaker:
            return TTSResult(False, f"ComfyUI风格 '{style_name}' 配置不完整(custom_voice)", backend_name=self.backend_name)
    else:
        return TTSResult(False, f"ComfyUI风格 '{style_name}' mode 无效: {mode}", backend_name=self.backend_name)

    # Map language to the MLX node's language combo. Default to Auto.
    detected = TTSTextUtils.detect_language(text)
    language = style.get("language") or LANG_TO_DEMO.get(detected, "Auto")

    # Sampling defaults match the MLX node defaults we exposed.
    seed = int(style.get("seed", 0) or 0)
    model_choice = str(style.get("model_choice", "1.7B") or "1.7B")
    precision = str(style.get("precision", "bf16") or "bf16")
    max_new_tokens = int(style.get("max_new_tokens", 2048) or 2048)
    top_p = float(style.get("top_p", 0.8) or 0.8)
    top_k = int(style.get("top_k", 20) or 20)
    temperature = float(style.get("temperature", 1.0) or 1.0)
    repetition_penalty = float(style.get("repetition_penalty", 1.05) or 1.05)

    # ---- Pause configuration (global, then per-style override). ----
    pause_cfg = {
        "pause_linebreak": float(self.get_config(ConfigKeys.COMFYUI_PAUSE_LINEBREAK, 0.0)),
        "period_pause": float(self.get_config(ConfigKeys.COMFYUI_PERIOD_PAUSE, 0.0)),
        "comma_pause": float(self.get_config(ConfigKeys.COMFYUI_COMMA_PAUSE, 0.0)),
        "question_pause": float(self.get_config(ConfigKeys.COMFYUI_QUESTION_PAUSE, 0.0)),
        "hyphen_pause": float(self.get_config(ConfigKeys.COMFYUI_HYPHEN_PAUSE, 0.0)),
    }
    # Allow per-style override.
    if isinstance(style.get("pause_cfg"), dict):
        for k in pause_cfg.keys():
            if k in style["pause_cfg"]:
                try:
                    pause_cfg[k] = float(style["pause_cfg"][k])
                except Exception:
                    pass  # keep the global value on a bad per-style entry

    try:
        if mode == "voice_clone":
            # Reference audio must live inside ComfyUI/input for LoadAudio.
            ref_filename = self._ensure_ref_in_input(input_dir, style.get("refer_wav", ""))
            prompt = self._build_prompt_voice_clone(
                ref_filename=ref_filename,
                ref_text=style.get("prompt_text", ""),
                target_text=text,
                language=language,
                model_choice=model_choice,
                precision=precision,
                seed=seed,
                max_new_tokens=max_new_tokens,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                audio_quality=audio_quality,
                mlx_python=mlx_python,
                mlx_cli=mlx_cli,
                pause_cfg=pause_cfg,
            )
        else:
            # Allow per-style / automatic instruct inference.
            instruct = str(style.get("instruct", "")).strip()
            auto_style = bool(style.get("auto_instruct", False))
            inferred = ""
            if instruct == "__AUTO__" or (not instruct and auto_style):
                chat_stream = kwargs.get("chat_stream")
                chat_id = kwargs.get("chat_id")
                inferred = await self._infer_instruct(
                    text=text,
                    detected_lang=detected,
                    chat_stream=chat_stream,
                    chat_id=chat_id,
                    style_name=style_name,
                )
                if inferred:
                    instruct = inferred
                # NOTE(review): if inference fails while instruct == "__AUTO__",
                # the literal "__AUTO__" marker is sent to the model — confirm
                # whether it should be blanked instead.

            # If the instruct contains usable fields, map them to real controls.
            max_chars = int(self.get_config(ConfigKeys.COMFYUI_AUTO_INSTRUCT_MAX_CHARS, 40) or 40)
            instruct, mapped_speed, mapped_pause_cfg = self._apply_instruct_controls(
                instruct=instruct,
                speed=float(style.get("speed", 1.0) or 1.0),
                pause_cfg=pause_cfg,
                max_chars=max_chars,
            )

            prompt = self._build_prompt_custom_voice(
                target_text=text,
                speaker=str(style.get("speaker", "")).strip(),
                model_path=str(style.get("model_path", "")).strip(),
                instruct=instruct,
                speed=mapped_speed,
                language=language,
                seed=seed,
                max_new_tokens=max_new_tokens,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                audio_quality=audio_quality,
                mlx_python=mlx_python,
                mlx_cli=mlx_cli,
                pause_cfg=mapped_pause_cfg,
            )

        logger.info(f"{self.log_prefix} ComfyUI请求: text='{text[:50]}...', style={style_name}")
        history_item = await self._queue_and_wait(server, prompt, timeout=timeout)
        audio_bytes = await self._download_output_audio(server, history_item, timeout=timeout)

        ok, msg = TTSFileManager.validate_audio_data(audio_bytes)
        if not ok:
            return TTSResult(False, f"ComfyUI 返回音频无效: {msg}", backend_name=self.backend_name)

        return await self.send_audio(
            audio_data=audio_bytes,
            audio_format="mp3",
            prefix="tts_comfyui",
            voice_info=f"style: {style_name}",
        )
    except Exception as e:
        # Report any backend failure as a TTSResult instead of raising.
        return TTSResult(False, f"ComfyUI后端错误: {e}", backend_name=self.backend_name)
|
||||
|
||||
|
||||
class ComfyUIVoiceCloneBackend(ComfyUIBackend):
    """ComfyUI backend restricted to voice-clone styles."""

    backend_name = "comfyui_voiceclone"
    backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS VoiceClone 专用)"
    allowed_modes = {"voice_clone"}

    def get_default_voice(self) -> str:
        """Prefer the voice-clone-specific default style; fall back to the shared one."""
        configured = (self.get_config(ConfigKeys.COMFYUI_VOICECLONE_DEFAULT_STYLE, "") or "").strip()
        return configured or super().get_default_voice()
|
||||
|
||||
|
||||
class ComfyUICustomVoiceBackend(ComfyUIBackend):
    """ComfyUI backend restricted to custom-voice styles."""

    backend_name = "comfyui_customvoice"
    backend_description = "ComfyUI 工作流 API(MLX Qwen3-TTS CustomVoice 专用)"
    allowed_modes = {"custom_voice"}

    def get_default_voice(self) -> str:
        """Prefer the custom-voice-specific default style; fall back to the shared one."""
        configured = (self.get_config(ConfigKeys.COMFYUI_CUSTOMVOICE_DEFAULT_STYLE, "") or "").strip()
        return configured or super().get_default_voice()
|
||||
|
|
@ -0,0 +1,285 @@
|
|||
"""
|
||||
CosyVoice后端实现
|
||||
使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
from typing import Optional, Tuple
|
||||
from .base import TTSBackendBase, TTSResult
|
||||
from ..utils.file import TTSFileManager
|
||||
from ..config_keys import ConfigKeys
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("tts_cosyvoice")
|
||||
|
||||
# CosyVoice instruct-prompt map (dialects, emotions, speaking rate, etc.).
# Keys are the user-facing keywords accepted as the `emotion` argument; values
# are the literal instruct prompts expected by the Fun-CosyVoice3 API.
# NOTE: the prompt strings (including the Chinese text and the
# `<|endofprompt|>` terminator) are runtime payloads — do not edit them.
COSYVOICE_INSTRUCT_MAP = {
    # Dialects
    "广东话": "You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
    "东北话": "You are a helpful assistant. 请用东北话表达。<|endofprompt|>",
    "甘肃话": "You are a helpful assistant. 请用甘肃话表达。<|endofprompt|>",
    "贵州话": "You are a helpful assistant. 请用贵州话表达。<|endofprompt|>",
    "河南话": "You are a helpful assistant. 请用河南话表达。<|endofprompt|>",
    "湖北话": "You are a helpful assistant. 请用湖北话表达。<|endofprompt|>",
    "湖南话": "You are a helpful assistant. 请用湖南话表达。<|endofprompt|>",
    "江西话": "You are a helpful assistant. 请用江西话表达。<|endofprompt|>",
    "闽南话": "You are a helpful assistant. 请用闽南话表达。<|endofprompt|>",
    "宁夏话": "You are a helpful assistant. 请用宁夏话表达。<|endofprompt|>",
    "山西话": "You are a helpful assistant. 请用山西话表达。<|endofprompt|>",
    "陕西话": "You are a helpful assistant. 请用陕西话表达。<|endofprompt|>",
    "山东话": "You are a helpful assistant. 请用山东话表达。<|endofprompt|>",
    "上海话": "You are a helpful assistant. 请用上海话表达。<|endofprompt|>",
    "四川话": "You are a helpful assistant. 请用四川话表达。<|endofprompt|>",
    "天津话": "You are a helpful assistant. 请用天津话表达。<|endofprompt|>",
    "云南话": "You are a helpful assistant. 请用云南话表达。<|endofprompt|>",

    # Volume
    "大声": "You are a helpful assistant. Please say a sentence as loudly as possible.<|endofprompt|>",
    "小声": "You are a helpful assistant. Please say a sentence in a very soft voice.<|endofprompt|>",

    # Speaking rate
    "慢速": "You are a helpful assistant. 请用尽可能慢地语速说一句话。<|endofprompt|>",
    "快速": "You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>",

    # Emotions
    "开心": "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
    "伤心": "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>",
    "生气": "You are a helpful assistant. 请非常生气地说一句话。<|endofprompt|>",

    # Special styles
    "小猪佩奇": "You are a helpful assistant. 我想体验一下小猪佩奇风格,可以吗?<|endofprompt|>",
    "机器人": "You are a helpful assistant. 你可以尝试用机器人的方式解答吗?<|endofprompt|>",
}
|
||||
|
||||
|
||||
class CosyVoiceBackend(TTSBackendBase):
|
||||
"""
|
||||
CosyVoice语音后端
|
||||
|
||||
使用 ModelScope 的 Fun-CosyVoice3-0.5B Gradio API 进行语音合成
|
||||
支持3秒极速复刻、自然语言控制(方言、情感、语速等)
|
||||
"""
|
||||
|
||||
backend_name = "cosyvoice"
|
||||
backend_description = "阿里云 CosyVoice3 API (ModelScope Gradio)"
|
||||
support_private_chat = True
|
||||
default_audio_format = "wav"
|
||||
|
||||
def get_default_voice(self) -> str:
|
||||
"""获取默认音色(CosyVoice 不需要预设音色)"""
|
||||
return ""
|
||||
|
||||
def validate_config(self) -> Tuple[bool, str]:
|
||||
"""验证配置"""
|
||||
gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "")
|
||||
|
||||
if not gradio_url:
|
||||
return False, "CosyVoice后端缺少必需的 gradio_url 配置"
|
||||
|
||||
return True, ""
|
||||
|
||||
def _resolve_instruct(self, emotion: Optional[str]) -> str:
|
||||
"""
|
||||
解析情感参数为指令文本
|
||||
|
||||
Args:
|
||||
emotion: 情感/方言关键词
|
||||
|
||||
Returns:
|
||||
指令文本
|
||||
"""
|
||||
if emotion and emotion in COSYVOICE_INSTRUCT_MAP:
|
||||
return COSYVOICE_INSTRUCT_MAP[emotion]
|
||||
|
||||
# 返回默认指令(确保不为空)
|
||||
default_instruct = self.get_config(
|
||||
ConfigKeys.COSYVOICE_DEFAULT_INSTRUCT,
|
||||
"You are a helpful assistant. 请用广东话表达。<|endofprompt|>"
|
||||
)
|
||||
|
||||
# 如果配置为空,强制使用广东话
|
||||
if not default_instruct or not default_instruct.strip():
|
||||
default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"
|
||||
|
||||
return default_instruct
|
||||
|
||||
    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        emotion: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """Synthesize speech via the Fun-CosyVoice3 Gradio API.

        Args:
            text: Text to synthesize.
            voice: For CosyVoice this is an optional path to a reference
                audio file; when it exists on disk it overrides the
                configured reference audio.
            emotion: Emotion/dialect/rate keyword (see COSYVOICE_INSTRUCT_MAP).

        Returns:
            TTSResult describing success or the failure reason.
        """
        # Fail fast on missing configuration.
        is_valid, error_msg = self.validate_config()
        if not is_valid:
            return TTSResult(False, error_msg, backend_name=self.backend_name)

        # Reject empty/whitespace-only input.
        if not text or not text.strip():
            return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

        # Read endpoint and synthesis-mode settings.
        gradio_url = self.get_config(ConfigKeys.COSYVOICE_GRADIO_URL, "")
        mode_config = self.get_config(ConfigKeys.COSYVOICE_DEFAULT_MODE, "3s极速复刻")

        # mode_checkbox_group is actually a Radio component expecting a string;
        # the config may deliver either a string or a one-element list.
        if isinstance(mode_config, list):
            mode_str = mode_config[0] if mode_config else "3s极速复刻"
        else:
            mode_str = mode_config if mode_config else "3s极速复刻"

        timeout = self.get_config(ConfigKeys.COSYVOICE_TIMEOUT, 60)
        reference_audio = self.get_config(ConfigKeys.COSYVOICE_REFERENCE_AUDIO, "")
        prompt_text = self.get_config(ConfigKeys.COSYVOICE_PROMPT_TEXT, "")

        # The "natural language control" mode still requires a reference audio
        # and prompt_text; fall back to the bundled sample (plugin_dir/test.wav).
        if not reference_audio or not os.path.exists(reference_audio):
            plugin_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            default_audio = os.path.join(plugin_dir, "test.wav")
            if os.path.exists(default_audio):
                reference_audio = default_audio
                logger.debug(f"{self.log_prefix} 使用默认参考音频: {reference_audio}")

        # Default transcript of the reference audio when none is configured.
        if not prompt_text:
            prompt_text = "大家好,我是嘉然,今天我来为大家朗读。"
            logger.debug(f"{self.log_prefix} 使用默认 prompt_text")

        # A caller-supplied voice path overrides the configured reference audio.
        if voice and os.path.exists(voice):
            reference_audio = voice

        # Resolve the instruct prompt from the emotion keyword.
        instruct_text = self._resolve_instruct(emotion)

        logger.info(
            f"{self.log_prefix} CosyVoice请求: text='{text[:50]}...' "
            f"(共{len(text)}字符), mode={mode_str}, instruct={emotion or '默认'}"
        )

        try:
            # Import gradio_client lazily so the plugin loads without it.
            try:
                from gradio_client import Client, handle_file
            except ImportError:
                logger.error(f"{self.log_prefix} gradio_client 未安装,请运行: pip install gradio_client")
                return TTSResult(
                    False,
                    "gradio_client 未安装,请运行: pip install gradio_client",
                    backend_name=self.backend_name
                )

            # Create the Gradio client, wiring the configured timeout into
            # httpx when possible; fall back to client defaults otherwise.
            try:
                import httpx
                httpx_kwargs = {"timeout": httpx.Timeout(timeout, read=timeout, write=timeout, connect=30.0)}
                client = Client(gradio_url, httpx_kwargs=httpx_kwargs)
            except Exception as e:
                logger.warning(f"{self.log_prefix} 无法设置 httpx 超时,使用默认配置: {e}")
                client = Client(gradio_url)

            # Wrap the reference audio for upload (None when the file is gone).
            logger.debug(f"{self.log_prefix} 准备参考音频: {reference_audio}")
            prompt_wav_upload = handle_file(reference_audio) if reference_audio and os.path.exists(reference_audio) else None
            logger.debug(f"{self.log_prefix} 参考音频准备完成")

            logger.info(f"{self.log_prefix} 调用 Gradio API: {gradio_url} (超时: {timeout}秒)")
            logger.debug(f"{self.log_prefix} mode参数: {mode_str} (type: {type(mode_str).__name__})")
            logger.debug(f"{self.log_prefix} prompt_text: {prompt_text[:50]}...")
            logger.debug(f"{self.log_prefix} instruct_text: {instruct_text[:50]}...")

            # Run the blocking predict() in a worker thread with a hard cap.
            # NOTE(review): asyncio.wait_for cancels the await, not the thread;
            # a timed-out predict() keeps running in the background.
            result = await asyncio.wait_for(
                asyncio.to_thread(
                    client.predict,
                    tts_text=text,
                    mode_checkbox_group=mode_str,
                    prompt_text=prompt_text,
                    prompt_wav_upload=prompt_wav_upload,
                    prompt_wav_record=None,
                    instruct_text=instruct_text,
                    seed=0,
                    stream=False,  # the API expects boolean False despite docs saying Literal['False']
                    api_name="/generate_audio"
                ),
                timeout=timeout
            )

            logger.info(f"{self.log_prefix} CosyVoice API 响应成功")

            # result is expected to be the generated audio file path.
            # NOTE(review): the temp file is never deleted here — presumably
            # left to the data/ directory cleanup; confirm.
            if not result or not os.path.exists(result):
                return TTSResult(
                    False,
                    f"CosyVoice 生成失败,未返回有效文件: {result}",
                    backend_name=self.backend_name
                )

            # Load the generated audio into memory.
            try:
                with open(result, 'rb') as f:
                    audio_data = f.read()
            except Exception as e:
                logger.error(f"{self.log_prefix} 读取音频文件失败: {e}")
                return TTSResult(
                    False,
                    f"读取音频文件失败: {e}",
                    backend_name=self.backend_name
                )

            # Sanity-check the audio bytes before sending.
            is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
            if not is_valid:
                logger.warning(f"{self.log_prefix} CosyVoice音频数据验证失败: {error_msg}")
                return TTSResult(
                    False,
                    f"CosyVoice语音{error_msg}",
                    backend_name=self.backend_name
                )

            logger.debug(
                f"{self.log_prefix} CosyVoice音频数据验证通过 "
                f"(大小: {len(audio_data)}字节)"
            )

            # Hand off to the shared send pipeline.
            audio_format = self.get_config(ConfigKeys.COSYVOICE_AUDIO_FORMAT, "wav")
            voice_info = f"模式: {mode_str}, 指令: {emotion or '默认'}"

            return await self.send_audio(
                audio_data=audio_data,
                audio_format=audio_format,
                prefix="tts_cosyvoice",
                voice_info=voice_info
            )

        except asyncio.TimeoutError:
            logger.error(f"{self.log_prefix} CosyVoice API 请求超时 (配置超时: {timeout}秒)")
            return TTSResult(
                False,
                "CosyVoice API 调用超时",
                backend_name=self.backend_name
            )
        except Exception as e:
            logger.error(f"{self.log_prefix} CosyVoice 执行异常: {e}")
            return TTSResult(
                False,
                f"CosyVoice 执行错误: {e}",
                backend_name=self.backend_name
            )
|
||||
|
|
@ -0,0 +1,230 @@
|
|||
"""
|
||||
豆包语音后端实现
|
||||
使用字节跳动豆包语音 API 进行语音合成
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import uuid
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from .base import TTSBackendBase, TTSResult
|
||||
from .doubao_stream_parser import DoubaoStreamParser
|
||||
from ..utils.file import TTSFileManager
|
||||
from ..utils.session import TTSSessionManager
|
||||
from ..config_keys import ConfigKeys
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("tts_doubao")
|
||||
|
||||
# Doubao emotion map: translates an emotion keyword into the context_texts
# steering prompt sent with the request. The prompt values are runtime
# payloads consumed by the Doubao API — do not edit them.
DOUBAO_EMOTION_MAP = {
    # Positive emotions
    "开心": "你的语气再欢乐一点",
    "兴奋": "用特别兴奋激动的语气说话",
    "温柔": "用温柔体贴的语气说话",
    "骄傲": "用骄傲的语气说话",
    "自信": "用自信坚定的语气说话",

    # Negative emotions
    "生气": "你得跟我互怼!就是跟我用吵架的语气对话",
    "愤怒": "用愤怒的语气说话",
    "伤心": "用特别特别痛心的语气说话",
    "失望": "用失望沮丧的语气说话",
    "委屈": "用委屈的语气说话",

    # Neutral tones
    "平静": "用平静淡定的语气说话",
    "严肃": "用严肃认真的语气说话",
    "疑惑": "用疑惑不解的语气说话",

    # Speaking-rate adjustments
    "慢速": "说慢一点",
    "快速": "说快一点",

    # Volume adjustments
    "小声": "你嗓门再小点",
    "大声": "大声一点",
}
|
||||
|
||||
|
||||
class DoubaoBackend(TTSBackendBase):
|
||||
"""
|
||||
豆包语音后端
|
||||
|
||||
使用字节跳动豆包语音 API 进行高质量语音合成
|
||||
支持预置音色和复刻音色
|
||||
"""
|
||||
|
||||
backend_name = "doubao"
|
||||
backend_description = "字节跳动豆包语音API"
|
||||
support_private_chat = True
|
||||
default_audio_format = "mp3"
|
||||
|
||||
def get_default_voice(self) -> str:
|
||||
"""获取默认音色"""
|
||||
return self.get_config(ConfigKeys.DOUBAO_DEFAULT_VOICE, "zh_female_shuangkuaisisi_moon_bigtts")
|
||||
|
||||
def validate_config(self) -> Tuple[bool, str]:
|
||||
"""验证配置"""
|
||||
app_id = self.get_config(ConfigKeys.DOUBAO_APP_ID, "")
|
||||
access_key = self.get_config(ConfigKeys.DOUBAO_ACCESS_KEY, "")
|
||||
resource_id = self.get_config(ConfigKeys.DOUBAO_RESOURCE_ID, "")
|
||||
|
||||
if not app_id or not access_key or not resource_id:
|
||||
return False, "豆包语音后端缺少必需的认证配置(app_id/access_key/resource_id)"
|
||||
|
||||
return True, ""
|
||||
|
||||
def _resolve_emotion(self, emotion: Optional[str]) -> Optional[List[str]]:
|
||||
"""
|
||||
解析情感参数为 context_texts
|
||||
|
||||
Args:
|
||||
emotion: 情感关键词
|
||||
|
||||
Returns:
|
||||
context_texts 列表或 None
|
||||
"""
|
||||
if emotion and emotion in DOUBAO_EMOTION_MAP:
|
||||
return [DOUBAO_EMOTION_MAP[emotion]]
|
||||
return None
|
||||
|
||||
    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        emotion: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """Synthesize speech via the Doubao (ByteDance) streaming TTS API.

        Args:
            text: Text to synthesize.
            voice: Speaker/voice id; falls back to the configured default.
            emotion: Emotion/tone keyword, mapped to context_texts.

        Returns:
            TTSResult describing success or the failure reason.
        """
        # Fail fast on missing credentials.
        is_valid, error_msg = self.validate_config()
        if not is_valid:
            return TTSResult(False, error_msg, backend_name=self.backend_name)

        # Reject empty/whitespace-only input.
        if not text or not text.strip():
            return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

        # Read endpoint/auth/timeout settings.
        api_url = self.get_config(ConfigKeys.DOUBAO_API_URL, "https://openspeech.bytedance.com/api/v3/tts/unidirectional")
        app_id = self.get_config(ConfigKeys.DOUBAO_APP_ID, "")
        access_key = self.get_config(ConfigKeys.DOUBAO_ACCESS_KEY, "")
        resource_id = self.get_config(ConfigKeys.DOUBAO_RESOURCE_ID, "")
        timeout = self.get_config(ConfigKeys.DOUBAO_TIMEOUT, 30)

        if not voice:
            voice = self.get_default_voice()

        # Build request headers; a fresh request id is generated per call.
        headers = {
            "Content-Type": "application/json",
            "X-Api-App-Id": app_id,
            "X-Api-Access-Key": access_key,
            "X-Api-Resource-Id": resource_id,
            "X-Api-Request-Id": str(uuid.uuid4()),
            "Accept-Encoding": "gzip, deflate"
        }

        # Build the request body.
        # NOTE(review): lowercase `any` here is the builtin, not typing.Any;
        # it happens to be accepted at runtime but should be typing.Any.
        request_data: Dict[str, any] = {
            "req_params": {
                "text": text,
                "speaker": voice,
                "audio_params": {
                    "format": self.get_config(ConfigKeys.DOUBAO_AUDIO_FORMAT, "mp3"),
                    "sample_rate": self.get_config(ConfigKeys.DOUBAO_SAMPLE_RATE, 24000),
                    "bitrate": self.get_config(ConfigKeys.DOUBAO_BITRATE, 128000)
                }
            }
        }

        # Optional parameters, sent only when explicitly configured.
        speed = self.get_config(ConfigKeys.DOUBAO_SPEED, None)
        if speed is not None:
            request_data["req_params"]["speed"] = speed

        volume = self.get_config(ConfigKeys.DOUBAO_VOLUME, None)
        if volume is not None:
            request_data["req_params"]["volume"] = volume

        # Resolve context_texts (tone-steering prompts).
        context_texts: Optional[List[str]] = None

        # The explicit emotion argument takes priority.
        if emotion:
            context_texts = self._resolve_emotion(emotion)
            if context_texts:
                logger.info(f"{self.log_prefix} 使用emotion参数: {emotion} -> {context_texts[0]}")

        # Otherwise fall back to the configured default.
        if not context_texts:
            context_texts = self.get_config(ConfigKeys.DOUBAO_CONTEXT_TEXTS, None)

        if context_texts:
            request_data["req_params"]["context_texts"] = context_texts

        logger.info(f"{self.log_prefix} 豆包语音请求: text='{text[:50]}...' (共{len(text)}字符), voice={voice}")

        try:
            session_manager = await TTSSessionManager.get_instance()
            async with session_manager.post(
                api_url,
                json=request_data,
                headers=headers,
                backend_name="doubao",
                timeout=timeout
            ) as response:
                logger.info(f"{self.log_prefix} 豆包API响应状态码: {response.status}")

                if response.status == 200:
                    # Parse the line-delimited JSON streaming body.
                    audio_data, error_msg = await DoubaoStreamParser.parse_response(
                        response,
                        log_prefix=self.log_prefix
                    )

                    if error_msg:
                        logger.error(f"{self.log_prefix} 豆包语音解析失败: {error_msg}")
                        return TTSResult(False, error_msg, backend_name=self.backend_name)

                    # Sanity-check the merged audio bytes.
                    is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
                    if not is_valid:
                        logger.warning(f"{self.log_prefix} 豆包音频数据验证失败: {error_msg}")
                        return TTSResult(False, f"豆包语音{error_msg}", backend_name=self.backend_name)

                    logger.debug(f"{self.log_prefix} 豆包音频数据验证通过 (大小: {len(audio_data)}字节)")

                    # Hand off to the shared send pipeline.
                    audio_format = self.get_config(ConfigKeys.DOUBAO_AUDIO_FORMAT, "mp3")
                    return await self.send_audio(
                        audio_data=audio_data,
                        audio_format=audio_format,
                        prefix="tts_doubao",
                        voice_info=f"音色: {voice}"
                    )
                else:
                    error_text = await response.text()
                    logger.error(f"{self.log_prefix} 豆包API请求失败[{response.status}]: {error_text[:200]}")
                    return TTSResult(
                        False,
                        f"豆包语音API调用失败: {response.status} - {error_text[:100]}",
                        backend_name=self.backend_name
                    )

        except asyncio.TimeoutError:
            logger.error(f"{self.log_prefix} 豆包API请求超时 (配置超时: {timeout}秒)")
            return TTSResult(False, "豆包语音API调用超时", backend_name=self.backend_name)
        except Exception as e:
            logger.error(f"{self.log_prefix} 豆包语音执行异常: {e}")
            return TTSResult(False, f"豆包语音执行错误: {e}", backend_name=self.backend_name)
|
||||
|
|
@ -0,0 +1,432 @@
|
|||
"""
|
||||
豆包语音流式响应解析器
|
||||
基于官方示例实现,确保兼容性和正确性
|
||||
|
||||
官方API说明:
|
||||
- code=0: 继续处理,可能包含 "data"(音频)或 "sentence"(文本)
|
||||
- code=20000000: 结束标志,可能包含 "usage"(用量统计)
|
||||
- code>0: 错误响应
|
||||
"""
|
||||
|
||||
import json
|
||||
import base64
|
||||
from typing import Tuple, Optional, List
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("doubao_stream_parser")
|
||||
|
||||
|
||||
class DoubaoStreamParser:
|
||||
"""
|
||||
豆包语音流式响应解析器
|
||||
|
||||
基于官方API实现,忠实还原官方示例逻辑。
|
||||
处理流程:
|
||||
1. 逐行读取 JSON 响应
|
||||
2. 检查状态码:code=0(继续), code=20000000(结束), code>0(错误)
|
||||
3. 提取音频数据(code=0 且有 "data" 字段)
|
||||
4. 记录日志(code=0 且有 "sentence" 字段)
|
||||
"""
|
||||
|
||||
def __init__(self, log_prefix: str = "[DoubaoParser]"):
|
||||
"""
|
||||
初始化解析器
|
||||
|
||||
Args:
|
||||
log_prefix: 日志前缀
|
||||
"""
|
||||
self.log_prefix = log_prefix
|
||||
self._audio_chunks: List[bytes] = []
|
||||
self._buffer: bytes = b''
|
||||
self._line_count: int = 0
|
||||
self._total_bytes: int = 0
|
||||
self._error_message: Optional[str] = None
|
||||
self._finished: bool = False # 是否收到结束信号
|
||||
self._usage_info: Optional[dict] = None
|
||||
|
||||
    def _decode_audio_from_base64(self, audio_base64: str) -> Optional[bytes]:
        """Decode a Base64-encoded audio payload.

        The official sample calls base64.b64decode(data["data"]) directly;
        this version additionally repairs missing '=' padding and validates
        the decoded result.

        Args:
            audio_base64: Base64 string taken from a "data" frame.

        Returns:
            Decoded audio bytes, or None when the input is empty or
            undecodable.
        """
        if not audio_base64:
            return None

        try:
            # Repad to a multiple of 4 so b64decode does not reject the input.
            padding_needed = len(audio_base64) % 4
            if padding_needed:
                audio_base64 += '=' * (4 - padding_needed)
                # The subtraction recovers the pre-padding length for the log.
                logger.debug(
                    f"{self.log_prefix} Base64填充已应用 "
                    f"(原长: {len(audio_base64) - (4 - padding_needed)}, 新长: {len(audio_base64)})"
                )

            audio_bytes = base64.b64decode(audio_base64)

            if not audio_bytes:
                logger.warning(f"{self.log_prefix} Base64解码结果为空")
                return None

            logger.debug(
                f"{self.log_prefix} 音频块解码成功 - 大小: {len(audio_bytes)}字节"
            )
            return audio_bytes

        except Exception as e:
            # Swallow decode failures: the stream continues with later frames.
            logger.error(
                f"{self.log_prefix} Base64解码失败: {e} "
                f"(Base64长度: {len(audio_base64)})"
            )
            return None
|
||||
|
||||
    def _process_json_line(self, line_str: str) -> Optional[str]:
        """Process one line-delimited JSON frame of the stream.

        Mirrors the official sample logic:
          1. inspect the "code" field;
          2. code=0 with "data"     -> collect an audio chunk;
          3. code=0 with "sentence" -> log the text (optional);
          4. code=20000000          -> end-of-stream signal;
          5. code>0                 -> API error.

        Args:
            line_str: One JSON document as a string.

        Returns:
            "END" on the end-of-stream frame, the error message string on an
            API error, None otherwise (keep reading).
        """
        try:
            json_obj = json.loads(line_str)
        except json.JSONDecodeError as e:
            # Malformed lines are tolerated and skipped.
            logger.debug(f"{self.log_prefix} JSON解析失败: {e}")
            return None
        except Exception as e:
            logger.warning(f"{self.log_prefix} JSON处理异常: {e}")
            return None

        if not isinstance(json_obj, dict):
            logger.debug(
                f"{self.log_prefix} 收到非字典JSON对象: {type(json_obj).__name__}"
            )
            return None

        # -1 when "code" is absent: falls through to the unknown-code branch.
        code = json_obj.get("code", -1)

        # Official logic: code=0 carries payload frames.
        if code == 0:
            # Audio payload (Base64 in "data").
            if "data" in json_obj and json_obj["data"]:
                chunk_audio = self._decode_audio_from_base64(json_obj["data"])
                if chunk_audio:
                    self._audio_chunks.append(chunk_audio)
                    logger.debug(
                        f"{self.log_prefix} 音频块#{len(self._audio_chunks)} 已接收 "
                        f"(大小: {len(chunk_audio)}字节)"
                    )

            # Optional sentence/alignment info — logged only.
            if "sentence" in json_obj and json_obj["sentence"]:
                sentence_data = json_obj.get("sentence", {})
                logger.debug(
                    f"{self.log_prefix} 收到句子数据: {sentence_data}"
                )

            return None  # keep reading

        # Official logic: code=20000000 terminates the stream.
        elif code == 20000000:
            logger.info(f"{self.log_prefix} 收到流结束信号 (code=20000000)")

            # Remember usage/billing statistics when present.
            if "usage" in json_obj:
                self._usage_info = json_obj["usage"]
                logger.info(
                    f"{self.log_prefix} 豆包用量信息: {self._usage_info}"
                )

            self._finished = True
            return "END"  # stream finished normally

        # Official logic: any positive code is an error response.
        elif code and code > 0:
            error_msg = json_obj.get("message", f"未知错误 (code={code})")
            logger.error(
                f"{self.log_prefix} 豆包语音API返回错误 "
                f"(code={code}): {error_msg}"
            )
            self._error_message = error_msg
            return error_msg  # propagate the error text to the caller

        # Unknown status code — ignore the frame.
        else:
            logger.debug(
                f"{self.log_prefix} 收到未知状态码: code={code}"
            )
            return None
|
||||
|
||||
def _find_data_chunk_offset(self, header: bytes) -> int:
|
||||
"""
|
||||
在 WAV header 中查找 'data' 块的位置
|
||||
|
||||
豆包返回的 WAV 可能包含额外的元数据块(如 LIST/INFO),
|
||||
导致 'data' 块不在标准的 44 字节位置。
|
||||
|
||||
Args:
|
||||
header: WAV 文件头部数据
|
||||
|
||||
Returns:
|
||||
data 块数据开始的位置(即 'data' + 4字节大小之后)
|
||||
"""
|
||||
pos = 12 # 跳过 RIFF(4) + size(4) + WAVE(4)
|
||||
|
||||
while pos < len(header) - 8:
|
||||
chunk_id = header[pos:pos+4]
|
||||
chunk_size = int.from_bytes(header[pos+4:pos+8], 'little')
|
||||
|
||||
if chunk_id == b'data':
|
||||
return pos + 8 # 返回音频数据开始位置
|
||||
|
||||
# 移动到下一个块
|
||||
pos += 8 + chunk_size
|
||||
# WAV 块需要对齐到偶数字节
|
||||
if chunk_size % 2 == 1:
|
||||
pos += 1
|
||||
|
||||
# 未找到 data 块,返回默认值
|
||||
return 44
|
||||
|
||||
    def _merge_audio_chunks(self, chunks: List[bytes]) -> bytes:
        """Merge streamed audio chunks into one playable blob.

        Doubao's streamed WAV responses have these quirks:
          1. the first chunk carries a full header (possibly > 44 bytes,
             with LIST/INFO metadata chunks);
          2. the header's size fields are streaming placeholders;
          3. later chunks are usually raw sample data without a header;
          4. the size fields must be patched after concatenation.

        Non-WAV data (e.g. MP3) is simply concatenated.

        Args:
            chunks: Audio chunks in arrival order.

        Returns:
            Merged (and, for WAV, size-corrected) bytes.
        """
        if not chunks:
            return b''

        first_chunk = chunks[0]

        # Not RIFF/WAV (e.g. MP3): plain concatenation is sufficient.
        if len(first_chunk) < 44 or first_chunk[:4] != b'RIFF':
            return b''.join(chunks)

        # Find where the actual samples start in the first chunk.
        data_offset = self._find_data_chunk_offset(first_chunk)
        logger.debug(f"{self.log_prefix} WAV data 块偏移: {data_offset} 字节")

        # Split header from the first chunk's sample data.
        header = bytearray(first_chunk[:data_offset])
        data_parts = [first_chunk[data_offset:]]
        skipped_headers = 0

        # Later chunks may redundantly repeat a RIFF header — strip it.
        for chunk in chunks[1:]:
            if len(chunk) > 44 and chunk[:4] == b'RIFF':
                chunk_data_offset = self._find_data_chunk_offset(chunk)
                data_parts.append(chunk[chunk_data_offset:])
                skipped_headers += 1
            else:
                # Raw sample data.
                data_parts.append(chunk)

        # Concatenate all sample data.
        audio_data = b''.join(data_parts)
        audio_size = len(audio_data)

        # Patch the RIFF size field (bytes 4-7): total file size minus 8.
        file_size = len(header) - 8 + audio_size
        header[4:8] = file_size.to_bytes(4, 'little')

        # Patch the data-chunk size field (the 4 bytes just before the samples).
        header[data_offset-4:data_offset] = audio_size.to_bytes(4, 'little')

        if skipped_headers > 0 or audio_size > 0:
            logger.info(
                f"{self.log_prefix} WAV 流式合并完成: "
                f"header={len(header)}字节, 音频={audio_size}字节, "
                f"跳过重复header={skipped_headers}"
            )

        return bytes(header) + audio_data
|
||||
|
||||
    def feed_chunk(self, chunk: bytes) -> Optional[str]:
        """Feed one network chunk into the parser.

        Buffers incoming bytes and processes every complete
        newline-terminated JSON line (the official sample uses iter_lines
        the same way); a trailing partial line stays buffered for the next
        call or finalize().

        Args:
            chunk: Raw bytes received from the HTTP stream.

        Returns:
            An error message string when the API reported an error; None
            otherwise (including after a normal end-of-stream frame).
        """
        if not chunk:
            return None

        self._buffer += chunk
        self._total_bytes += len(chunk)

        # Consume complete lines from the buffer.
        while b'\n' in self._buffer:
            line_bytes, self._buffer = self._buffer.split(b'\n', 1)

            # Decode the line; errors='replace' makes failure unlikely,
            # but stay defensive and skip undecodable lines.
            try:
                line_str = line_bytes.decode('utf-8', errors='replace').strip()
            except Exception as e:
                logger.warning(
                    f"{self.log_prefix} 行解码失败: {e}, 跳过该行"
                )
                self._line_count += 1
                continue

            if not line_str:
                continue

            self._line_count += 1

            # Dispatch the frame.
            result = self._process_json_line(line_str)

            # Stop immediately on end-of-stream or error.
            if result == "END":
                return None  # normal completion (error stays None)
            elif result:  # non-empty string: API error text
                return result

        return None
|
||||
|
||||
    def finalize(self) -> Tuple[Optional[bytes], Optional[str]]:
        """Flush remaining buffered data and produce the final audio.

        Returns:
            (audio_data, error_message):
              - audio_data: merged audio bytes on success, else None;
              - error_message: failure description on failure, else None.
        """
        # The stream may end without a trailing newline — parse the leftover.
        if self._buffer.strip():
            try:
                line_str = self._buffer.decode('utf-8', errors='replace').strip()
                if line_str:
                    logger.debug(
                        f"{self.log_prefix} 处理最后的buffer数据 "
                        f"(长度: {len(line_str)}字符)"
                    )
                    result = self._process_json_line(line_str)
                    if result and result != "END":
                        # The trailing line carried an error frame.
                        self._error_message = result
            except Exception as e:
                logger.warning(
                    f"{self.log_prefix} 最后buffer解析异常: {e}"
                )

        logger.info(
            f"{self.log_prefix} 豆包流解析完成 - "
            f"处理行数: {self._line_count}, "
            f"音频块数: {len(self._audio_chunks)}, "
            f"接收字节数: {self._total_bytes}, "
            f"正常结束: {self._finished}"
        )

        # Errors take precedence over any partially collected audio.
        if self._error_message:
            logger.error(
                f"{self.log_prefix} 豆包API返回错误: {self._error_message}"
            )
            return None, f"豆包语音API错误: {self._error_message}"

        # No audio at all: distinguish "no data" from "data but no audio".
        if not self._audio_chunks:
            if self._total_bytes == 0:
                logger.warning(
                    f"{self.log_prefix} 豆包API未返回任何数据"
                )
                return None, "未收到任何响应数据"

            logger.warning(
                f"{self.log_prefix} 收到 {self._total_bytes} 字节数据但无音频块"
            )
            return None, "豆包语音未返回任何音频数据"

        # Integrity check: drop implausibly small (likely corrupt) chunks.
        min_chunk_size = 50  # minimum plausible audio chunk size in bytes
        valid_chunks = [
            chunk for chunk in self._audio_chunks
            if len(chunk) >= min_chunk_size
        ]

        if not valid_chunks:
            logger.error(
                f"{self.log_prefix} 所有音频块都太小 (可能是损坏的数据)"
            )
            logger.debug(
                f"{self.log_prefix} 块大小分布: {[len(c) for c in self._audio_chunks]}"
            )
            return None, "音频数据不完整或已损坏"

        # Merge, repairing streamed WAV headers where necessary.
        merged_audio = self._merge_audio_chunks(valid_chunks)

        logger.info(
            f"{self.log_prefix} 音频合并完成 - "
            f"有效块数: {len(valid_chunks)}/{len(self._audio_chunks)}, "
            f"总大小: {len(merged_audio)}字节"
        )

        return merged_audio, None
|
||||
|
||||
    @classmethod
    async def parse_response(
        cls,
        response,
        log_prefix: str = "[DoubaoParser]"
    ) -> Tuple[Optional[bytes], Optional[str]]:
        """Parse a full Doubao streaming HTTP response into audio bytes.

        Args:
            response: aiohttp response object; its body is consumed via
                response.content.iter_any().
            log_prefix: Log message prefix.

        Returns:
            (audio_data, error_message) — on success the audio bytes with
            error None, on failure None with an error description.
        """
        parser = cls(log_prefix)

        # Stream the body chunk by chunk into the parser.
        async for chunk in response.content.iter_any():
            result = parser.feed_chunk(chunk)

            # Abort immediately on an API-reported error.
            # NOTE(review): feed_chunk never returns "END" (it maps the end
            # frame to None), so the != "END" guard is defensive only.
            if result and result != "END":
                return None, result

        # Flush the tail and merge the collected audio.
        return parser.finalize()
|
||||
|
|
@ -0,0 +1,326 @@
|
|||
"""
|
||||
GPT-SoVITS 后端实现
|
||||
使用本地 GPT-SoVITS 服务进行语音合成
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Optional, Dict, Any, Tuple, ClassVar
|
||||
from .base import TTSBackendBase, TTSResult
|
||||
from ..utils.text import TTSTextUtils
|
||||
from ..utils.file import TTSFileManager
|
||||
from ..utils.session import TTSSessionManager
|
||||
from ..config_keys import ConfigKeys
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("tts_gpt_sovits")
|
||||
|
||||
|
||||
class GPTSoVITSBackend(TTSBackendBase):
|
||||
"""
|
||||
GPT-SoVITS 后端
|
||||
|
||||
使用本地 GPT-SoVITS 服务进行高度定制化的语音合成
|
||||
支持动态切换 GPT 和 SoVITS 模型权重
|
||||
"""
|
||||
|
||||
backend_name = "gpt_sovits"
|
||||
backend_description = "本地GPT-SoVITS服务"
|
||||
support_private_chat = True
|
||||
default_audio_format = "mp3"
|
||||
|
||||
# 类变量:记录当前加载的模型路径,避免重复切换
|
||||
_current_gpt_weights: ClassVar[Optional[str]] = None
|
||||
_current_sovits_weights: ClassVar[Optional[str]] = None
|
||||
|
||||
def get_default_voice(self) -> str:
|
||||
"""获取默认风格"""
|
||||
return "default"
|
||||
|
||||
    async def _switch_model(
        self,
        server: str,
        gpt_weights: Optional[str],
        sovits_weights: Optional[str],
        timeout: int
    ) -> Tuple[bool, str]:
        """Switch the GPT-SoVITS service to the requested model weights.

        Uses the v2 endpoints /set_gpt_weights and /set_sovits_weights, and
        falls back to the legacy /set_model endpoint (which switches both
        weights at once) when the service answers 404. Successfully applied
        paths are cached in class variables so unchanged weights are not
        re-sent.

        Args:
            server: Base URL of the GPT-SoVITS service.
            gpt_weights: GPT weight path, or None to leave unchanged.
            sovits_weights: SoVITS weight path, or None to leave unchanged.
            timeout: Per-request timeout in seconds.

        Returns:
            (success, error_message) — error_message is "" on success.
        """
        session_manager = await TTSSessionManager.get_instance()

        async def _set_model_v1() -> Tuple[bool, str]:
            # Legacy api.py only exposes /set_model, which needs both paths.
            if not gpt_weights or not sovits_weights:
                return False, "当前GPT-SoVITS服务不支持单独切换模型(请同时配置GPT与SoVITS权重)"
            set_model_url = (
                f"{server.rstrip('/')}/set_model?"
                f"gpt_model_path={gpt_weights}&sovits_model_path={sovits_weights}"
            )
            logger.info(f"{self.log_prefix} 切换模型(兼容模式): {gpt_weights} | {sovits_weights}")
            try:
                async with session_manager.get(
                    set_model_url,
                    backend_name="gpt_sovits",
                    timeout=timeout
                ) as response:
                    if response.status == 200:
                        # Cache both paths on the class so other instances
                        # skip redundant switches.
                        GPTSoVITSBackend._current_gpt_weights = gpt_weights
                        GPTSoVITSBackend._current_sovits_weights = sovits_weights
                        logger.info(f"{self.log_prefix} 模型切换成功(兼容模式)")
                        return True, ""
                    error_text = await response.text()
                    return False, f"模型切换失败: {error_text}"
            except Exception as e:
                return False, f"模型切换异常: {e}"

        # Switch GPT weights only when they differ from the cached state.
        if gpt_weights and gpt_weights != GPTSoVITSBackend._current_gpt_weights:
            gpt_url = f"{server.rstrip('/')}/set_gpt_weights?weights_path={gpt_weights}"
            logger.info(f"{self.log_prefix} 切换GPT模型: {gpt_weights}")

            try:
                async with session_manager.get(
                    gpt_url,
                    backend_name="gpt_sovits",
                    timeout=timeout
                ) as response:
                    if response.status == 200:
                        GPTSoVITSBackend._current_gpt_weights = gpt_weights
                        logger.info(f"{self.log_prefix} GPT模型切换成功")
                    elif response.status == 404:
                        # Old service without /set_gpt_weights — legacy path.
                        return await _set_model_v1()
                    else:
                        error_text = await response.text()
                        return False, f"GPT模型切换失败: {error_text}"
            except Exception as e:
                return False, f"GPT模型切换异常: {e}"

        # Switch SoVITS weights with the same caching/fallback logic.
        if sovits_weights and sovits_weights != GPTSoVITSBackend._current_sovits_weights:
            sovits_url = f"{server.rstrip('/')}/set_sovits_weights?weights_path={sovits_weights}"
            logger.info(f"{self.log_prefix} 切换SoVITS模型: {sovits_weights}")

            try:
                async with session_manager.get(
                    sovits_url,
                    backend_name="gpt_sovits",
                    timeout=timeout
                ) as response:
                    if response.status == 200:
                        GPTSoVITSBackend._current_sovits_weights = sovits_weights
                        logger.info(f"{self.log_prefix} SoVITS模型切换成功")
                    elif response.status == 404:
                        # Old service without /set_sovits_weights — legacy path.
                        return await _set_model_v1()
                    else:
                        error_text = await response.text()
                        return False, f"SoVITS模型切换失败: {error_text}"
            except Exception as e:
                return False, f"SoVITS模型切换异常: {e}"

        return True, ""
|
||||
|
||||
def _normalize_styles_config(self, styles_config: Any) -> Dict[str, Any]:
|
||||
"""
|
||||
规范化风格配置格式
|
||||
|
||||
支持两种格式:
|
||||
1. 旧格式(字典): {"default": {...}, "happy": {...}}
|
||||
2. 新格式(数组): [{"name": "default", ...}, {"name": "happy", ...}]
|
||||
|
||||
统一转换为字典格式供内部使用
|
||||
"""
|
||||
# 如果是字典格式(旧格式),直接返回
|
||||
if isinstance(styles_config, dict):
|
||||
return styles_config
|
||||
|
||||
# 如果是数组格式(新格式),转换为字典
|
||||
if isinstance(styles_config, list):
|
||||
result = {}
|
||||
for style in styles_config:
|
||||
if isinstance(style, dict) and "name" in style:
|
||||
style_name = style["name"]
|
||||
# 复制配置,移除 name 字段
|
||||
style_data = {k: v for k, v in style.items() if k != "name"}
|
||||
result[style_name] = style_data
|
||||
return result
|
||||
|
||||
# 其他情况返回空字典
|
||||
return {}
|
||||
|
||||
def validate_config(self) -> Tuple[bool, str]:
    """Check that the GPT-SoVITS style configuration is usable.

    Returns:
        (ok, error_message) — error_message is empty on success.
    """
    raw = self.get_config(ConfigKeys.GPT_SOVITS_STYLES, {})
    normalized = self._normalize_styles_config(raw)

    # An empty mapping or one lacking "default" is unusable.
    if "default" not in normalized:
        return False, "GPT-SoVITS未配置任何语音风格"

    # The default style must at least carry a reference wav and prompt text.
    default_cfg = normalized.get("default", {})
    has_refer = bool(default_cfg.get("refer_wav"))
    has_prompt = bool(default_cfg.get("prompt_text"))
    if not (has_refer and has_prompt):
        return False, "GPT-SoVITS默认风格配置不完整(需要refer_wav和prompt_text)"

    return True, ""
|
||||
|
||||
async def execute(
    self,
    text: str,
    voice: Optional[str] = None,
    **kwargs
) -> TTSResult:
    """
    Run GPT-SoVITS speech synthesis.

    Tries the modern ``/tts`` endpoint first; when the server answers
    404 (an older api.py without ``/tts``), retries once against the
    legacy root path with the legacy payload key names.

    Args:
        text: Text to synthesize.
        voice: Style name; unknown/missing names fall back to "default".

    Returns:
        TTSResult describing success or the failure reason.
    """
    # Reject empty/whitespace-only input early.
    if not text or not text.strip():
        return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

    # Load backend configuration.
    server = self.get_config(ConfigKeys.GPT_SOVITS_SERVER, "http://127.0.0.1:9880")
    styles_raw = self.get_config(ConfigKeys.GPT_SOVITS_STYLES, {})
    styles = self._normalize_styles_config(styles_raw)
    timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60)

    # Resolve the style to use (unknown names fall back to "default").
    voice_style = voice if voice and voice in styles else "default"

    if voice_style not in styles:
        return TTSResult(
            False,
            f"GPT-SoVITS风格 '{voice_style}' 未配置",
            backend_name=self.backend_name
        )

    style_config = styles[voice_style]
    refer_wav_path = style_config.get("refer_wav", "")
    prompt_text = style_config.get("prompt_text", "")
    prompt_language = style_config.get("prompt_language", "zh")
    gpt_weights = style_config.get("gpt_weights")
    sovits_weights = style_config.get("sovits_weights")

    if not refer_wav_path or not prompt_text:
        return TTSResult(
            False,
            f"GPT-SoVITS风格 '{voice_style}' 配置不完整",
            backend_name=self.backend_name
        )

    # Switch model weights first when the style pins specific checkpoints.
    if gpt_weights or sovits_weights:
        switch_success, switch_error = await self._switch_model(
            server, gpt_weights, sovits_weights, timeout
        )
        if not switch_success:
            return TTSResult(False, switch_error, backend_name=self.backend_name)

    # Detect the language of the input text.
    text_language = TTSTextUtils.detect_language(text)

    # Payload for the modern /tts endpoint.
    data = {
        "text": text,
        "text_lang": text_language,
        "ref_audio_path": refer_wav_path,
        "prompt_text": prompt_text,
        "prompt_lang": prompt_language
    }

    tts_url = f"{server.rstrip('/')}/tts"
    legacy_tts_url = f"{server.rstrip('/')}/"
    # Payload for the legacy root-path API (older key names).
    legacy_data = {
        "text": text,
        "text_language": text_language,
        "refer_wav_path": refer_wav_path,
        "prompt_text": prompt_text,
        "prompt_language": prompt_language,
    }

    logger.info(f"{self.log_prefix} GPT-SoVITS请求: text='{text[:50]}...', style={voice_style}")

    try:
        session_manager = await TTSSessionManager.get_instance()
        async with session_manager.post(
            tts_url,
            json=data,
            backend_name="gpt_sovits",
            timeout=timeout
        ) as response:
            if response.status == 200:
                audio_data = await response.read()

                # Validate the returned audio bytes before sending.
                is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
                if not is_valid:
                    return TTSResult(False, f"GPT-SoVITS{error_msg}", backend_name=self.backend_name)

                # Deliver via the shared send helper.
                return await self.send_audio(
                    audio_data=audio_data,
                    audio_format="wav",
                    prefix="tts_gpt_sovits",
                    voice_info=f"风格: {voice_style}"
                )
            elif response.status == 404:
                # Old api.py has no /tts endpoint; fall through to the
                # legacy root-path request below.
                logger.warning(f"{self.log_prefix} /tts 端点不存在,尝试兼容模式请求根路径")
            else:
                error_info = await response.text()
                logger.error(f"{self.log_prefix} GPT-SoVITS API失败[{response.status}]: {error_info[:200]}")
                return TTSResult(
                    False,
                    f"GPT-SoVITS API调用失败: {response.status}",
                    backend_name=self.backend_name
                )

        # Reached only when /tts returned 404: retry against the legacy root path.
        async with session_manager.post(
            legacy_tts_url,
            json=legacy_data,
            backend_name="gpt_sovits",
            timeout=timeout
        ) as response:
            if response.status == 200:
                audio_data = await response.read()

                # Validate the returned audio bytes before sending.
                is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
                if not is_valid:
                    return TTSResult(False, f"GPT-SoVITS{error_msg}", backend_name=self.backend_name)

                return await self.send_audio(
                    audio_data=audio_data,
                    audio_format="wav",
                    prefix="tts_gpt_sovits",
                    voice_info=f"风格: {voice_style}"
                )
            else:
                error_info = await response.text()
                logger.error(f"{self.log_prefix} GPT-SoVITS API失败[{response.status}]: {error_info[:200]}")
                return TTSResult(
                    False,
                    f"GPT-SoVITS API调用失败: {response.status}",
                    backend_name=self.backend_name
                )

    except asyncio.TimeoutError:
        return TTSResult(False, "GPT-SoVITS API调用超时", backend_name=self.backend_name)
    except Exception as e:
        logger.error(f"{self.log_prefix} GPT-SoVITS执行错误: {e}")
        return TTSResult(False, f"GPT-SoVITS执行错误: {e}", backend_name=self.backend_name)
|
||||
|
|
@ -0,0 +1,186 @@
|
|||
"""
|
||||
GSV2P 后端实现
|
||||
使用 GSV2P 云端 API 进行语音合成
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
from .base import TTSBackendBase, TTSResult
|
||||
from ..utils.file import TTSFileManager
|
||||
from ..utils.session import TTSSessionManager
|
||||
from ..config_keys import ConfigKeys
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("tts_gsv2p")
|
||||
|
||||
# Retry policy for transient GSV2P server errors.
MAX_RETRIES = 5  # maximum number of attempts
RETRY_DELAY = 3.0  # delay between attempts (seconds)
|
||||
|
||||
|
||||
class GSV2PBackend(TTSBackendBase):
    """
    GSV2P backend.

    High-quality speech synthesis through the GSV2P cloud API, with a
    bounded retry loop (MAX_RETRIES / RETRY_DELAY) around transient
    server errors and timeouts.
    """

    backend_name = "gsv2p"
    backend_description = "GSV2P云端API语音合成"
    support_private_chat = True
    default_audio_format = "mp3"

    def get_default_voice(self) -> str:
        """Return the configured default voice name."""
        return self.get_config(ConfigKeys.GSV2P_DEFAULT_VOICE, "原神-中文-派蒙_ZH")

    def validate_config(self) -> Tuple[bool, str]:
        """Validate the backend configuration; the API token is mandatory."""
        api_token = self.get_config(ConfigKeys.GSV2P_API_TOKEN, "")
        if not api_token:
            return False, "GSV2P后端缺少API Token配置"
        return True, ""

    async def _make_request(
        self,
        api_url: str,
        request_data: Dict[str, Any],
        headers: Dict[str, str],
        timeout: int
    ) -> Tuple[bool, Any, str]:
        """
        Issue a single API request.

        Returns:
            (success flag, audio bytes or None, error message)
        """
        session_manager = await TTSSessionManager.get_instance()
        async with session_manager.post(
            api_url,
            json=request_data,
            headers=headers,
            backend_name="gsv2p",
            timeout=timeout
        ) as response:
            if response.status == 200:
                content_type = response.headers.get('Content-Type', '')
                audio_data = await response.read()

                # A JSON body on HTTP 200 means the server reported an
                # error (it can return parameter errors when unstable).
                if 'application/json' in content_type:
                    try:
                        error_json = json.loads(audio_data.decode('utf-8'))
                        error_msg = error_json.get('error', {}).get('message', str(error_json))
                        # Parameter errors are usually transient server-side
                        # issues, so the caller may retry.
                        return False, None, f"API返回错误: {error_msg}"
                    except Exception:
                        return False, None, "API返回异常响应"

                # Validate the audio payload before declaring success.
                is_valid, error_msg = TTSFileManager.validate_audio_data(audio_data)
                if not is_valid:
                    return False, None, f"音频数据无效: {error_msg}"

                return True, audio_data, ""
            else:
                error_text = await response.text()
                return False, None, f"API调用失败: {response.status} - {error_text[:100]}"

    async def execute(
        self,
        text: str,
        voice: Optional[str] = None,
        **kwargs
    ) -> TTSResult:
        """
        Run GSV2P speech synthesis (with retry).

        Args:
            text: Text to synthesize.
            voice: Voice name; falls back to the configured default.

        Returns:
            TTSResult
        """
        # Validate configuration first.
        is_valid, error_msg = self.validate_config()
        if not is_valid:
            return TTSResult(False, error_msg, backend_name=self.backend_name)

        # Reject empty/whitespace-only input.
        if not text or not text.strip():
            return TTSResult(False, "待合成的文本为空", backend_name=self.backend_name)

        # Load configuration.
        api_url = self.get_config(ConfigKeys.GSV2P_API_URL, "https://gsv2p.acgnai.top/v1/audio/speech")
        api_token = self.get_config(ConfigKeys.GSV2P_API_TOKEN, "")
        timeout = self.get_config(ConfigKeys.GSV2P_TIMEOUT, 30)

        if not voice:
            voice = self.get_default_voice()

        # Build the request payload (note: other_params is deprecated by the API).
        request_data: Dict[str, Any] = {
            "model": self.get_config(ConfigKeys.GSV2P_MODEL, "tts-v4"),
            "input": text,
            "voice": voice,
            "response_format": self.get_config(ConfigKeys.GSV2P_RESPONSE_FORMAT, "mp3"),
            "speed": self.get_config(ConfigKeys.GSV2P_SPEED, 1)
        }

        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        }

        logger.info(f"{self.log_prefix} GSV2P请求: text='{text[:50]}...', voice={voice}")
        logger.debug(f"{self.log_prefix} GSV2P完整请求参数: {json.dumps(request_data, ensure_ascii=False, indent=2)}")

        last_error = ""
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                success, audio_data, error_msg = await self._make_request(
                    api_url, request_data, headers, timeout
                )

                if success and audio_data:
                    if attempt > 1:
                        logger.info(f"{self.log_prefix} GSV2P第{attempt}次重试成功")

                    logger.info(f"{self.log_prefix} GSV2P响应: 数据大小={len(audio_data)}字节")

                    # Deliver via the shared send helper.
                    audio_format = self.get_config(ConfigKeys.GSV2P_RESPONSE_FORMAT, "mp3")
                    return await self.send_audio(
                        audio_data=audio_data,
                        audio_format=audio_format,
                        prefix="tts_gsv2p",
                        voice_info=f"音色: {voice}"
                    )
                else:
                    # Request-level failure: wait and retry until exhausted.
                    last_error = error_msg
                    if attempt < MAX_RETRIES:
                        logger.warning(f"{self.log_prefix} GSV2P请求失败 ({error_msg}), {RETRY_DELAY}秒后重试 (尝试 {attempt}/{MAX_RETRIES})")
                        await asyncio.sleep(RETRY_DELAY)
                    else:
                        logger.error(f"{self.log_prefix} GSV2P请求失败,已达最大重试次数: {error_msg}")

            except asyncio.TimeoutError:
                # Timeouts are retried the same way as request failures.
                last_error = "API调用超时"
                if attempt < MAX_RETRIES:
                    logger.warning(f"{self.log_prefix} GSV2P超时, {RETRY_DELAY}秒后重试 (尝试 {attempt}/{MAX_RETRIES})")
                    await asyncio.sleep(RETRY_DELAY)
                else:
                    logger.error(f"{self.log_prefix} GSV2P超时,已达最大重试次数")

            except Exception as e:
                # Unexpected errors are logged and retried; the final
                # attempt breaks out to the failure return below.
                last_error = str(e)
                logger.error(f"{self.log_prefix} GSV2P执行错误: {e}")
                if attempt < MAX_RETRIES:
                    await asyncio.sleep(RETRY_DELAY)
                else:
                    break

        return TTSResult(False, f"GSV2P {last_error} (已重试{MAX_RETRIES}次)", backend_name=self.backend_name)
|
||||
|
|
@ -0,0 +1,292 @@
|
|||
# tts_voice_plugin - 自动生成的配置文件
|
||||
# 统一TTS语音合成插件,整合AI Voice、GSV2P、GPT-SoVITS、豆包语音、CosyVoice、ComfyUI等多种后端引擎,提供灵活的语音合成能力。
|
||||
|
||||
# 插件基本配置
|
||||
[plugin]
|
||||
|
||||
# 是否启用插件
|
||||
enabled = true
|
||||
|
||||
# 配置文件版本
|
||||
config_version = "3.2.3"
|
||||
|
||||
# 通用设置
|
||||
|
||||
[general]
|
||||
|
||||
# 默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui)
|
||||
# 可选: ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice
|
||||
default_backend = "comfyui_customvoice"
|
||||
|
||||
# 请求超时时间(秒)
|
||||
timeout = 60
|
||||
|
||||
# 最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断)
|
||||
max_text_length = 200
|
||||
|
||||
# 是否使用replyer润色语音内容
|
||||
use_replyer_rewrite = true
|
||||
|
||||
# 音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录)
|
||||
audio_output_dir = ""
|
||||
|
||||
# 是否使用base64编码发送音频(备选方案)
|
||||
use_base64_audio = true
|
||||
|
||||
# 是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题)
|
||||
split_sentences = true
|
||||
|
||||
# 分段发送时每条语音之间的延迟(秒)
|
||||
split_delay = 0.3
|
||||
|
||||
# 自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段)
|
||||
split_min_total_chars = 120
|
||||
|
||||
# 句子最小长度:过短片段会合并到前一句(用于减少碎片段)
|
||||
split_min_sentence_chars = 6
|
||||
|
||||
# 自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。
|
||||
split_max_segments = 3
|
||||
|
||||
# 自动分段打包目标长度(字符)。用于把多句合并成更少段。
|
||||
split_chunk_chars = 110
|
||||
|
||||
# 是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户)
|
||||
send_error_messages = true
|
||||
|
||||
# 组件启用控制
|
||||
|
||||
[components]
|
||||
|
||||
# 是否启用Action组件
|
||||
action_enabled = true
|
||||
|
||||
# 是否启用Command组件
|
||||
command_enabled = true
|
||||
|
||||
# 是否启用 instruct 调试命令组件(/tts_instruct)
|
||||
instruct_command_enabled = true
|
||||
|
||||
# 概率控制配置
|
||||
|
||||
[probability]
|
||||
|
||||
# 是否启用概率控制
|
||||
enabled = true
|
||||
|
||||
# 基础触发概率
|
||||
base_probability = 1
|
||||
|
||||
# 关键词强制触发
|
||||
keyword_force_trigger = true
|
||||
|
||||
# 强制触发关键词
|
||||
force_keywords = [
|
||||
"一定要用语音",
|
||||
"必须语音",
|
||||
"语音回复我",
|
||||
"务必用语音",
|
||||
]
|
||||
|
||||
# AI Voice后端配置
|
||||
|
||||
[ai_voice]
|
||||
|
||||
# 默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女)
|
||||
default_character = "邻家小妹"
|
||||
|
||||
# GSV2P后端配置
|
||||
|
||||
[gsv2p]
|
||||
|
||||
# GSV2P API地址
|
||||
api_url = "https://gsv2p.acgnai.top/v1/audio/speech"
|
||||
|
||||
# API认证Token
|
||||
api_token = ""
|
||||
|
||||
# 默认音色
|
||||
default_voice = "原神-中文-派蒙_ZH"
|
||||
|
||||
# API请求超时(秒)
|
||||
timeout = 149
|
||||
|
||||
# TTS模型
|
||||
model = "tts-v4"
|
||||
|
||||
# 音频格式
|
||||
response_format = "wav"
|
||||
|
||||
# 语音速度
|
||||
speed = 1
|
||||
|
||||
# GPT-SoVITS后端配置
|
||||
|
||||
[gpt_sovits]
|
||||
|
||||
# GPT-SoVITS服务地址
|
||||
server = "http://127.0.0.1:9880"
|
||||
|
||||
# 语音风格配置
|
||||
|
||||
# 每个 [[gpt_sovits.styles]] 表定义一个语音风格(name 为风格名称)
|
||||
|
||||
[[gpt_sovits.styles]]
|
||||
name = "default"
|
||||
refer_wav = "/Users/xenon/Downloads/seiun_tts/qingyun_tiankong_voice/s978ztt245c3jxms6apadwgna4e7hmb.mp3"
|
||||
prompt_text = "私にしてはがんばった方ではないでしょーか?"
|
||||
prompt_language = "ja"
|
||||
gpt_weights = "/Users/xenon/Downloads/GPT-SoVITS/GPT_weights_v4/seiun-e15.ckpt"
|
||||
sovits_weights = "/Users/xenon/Downloads/GPT-SoVITS/SoVITS_weights_v4/seiun_e2_s144_l32.pth"
|
||||
|
||||
[[gpt_sovits.styles]]
|
||||
name = ""
|
||||
refer_wav = ""
|
||||
prompt_text = ""
|
||||
prompt_language = "zh"
|
||||
gpt_weights = ""
|
||||
sovits_weights = ""
|
||||
|
||||
# 豆包语音后端配置
[doubao]
|
||||
|
||||
# 豆包语音API地址
|
||||
api_url = "https://openspeech.bytedance.com/api/v3/tts/unidirectional"
|
||||
|
||||
# 豆包APP ID
|
||||
app_id = ""
|
||||
|
||||
# 豆包Access Key
|
||||
access_key = ""
|
||||
|
||||
# 豆包Resource ID
|
||||
resource_id = "seed-tts-2.0"
|
||||
|
||||
# 默认音色
|
||||
default_voice = "zh_female_vv_uranus_bigtts"
|
||||
|
||||
# API请求超时(秒)
|
||||
timeout = 60
|
||||
|
||||
# 音频格式
|
||||
audio_format = "wav"
|
||||
|
||||
# 采样率
|
||||
sample_rate = 24000
|
||||
|
||||
# 比特率
|
||||
bitrate = 128000
|
||||
|
||||
# CosyVoice后端配置
|
||||
|
||||
[cosyvoice]
|
||||
|
||||
# Gradio API地址
|
||||
gradio_url = "https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/"
|
||||
|
||||
# 推理模式(3s极速复刻/自然语言控制)
|
||||
default_mode = "3s极速复刻"
|
||||
|
||||
# 默认指令(用于自然语言控制模式)
|
||||
default_instruct = "You are a helpful assistant. 请用广东话表达。<|endofprompt|>"
|
||||
|
||||
# 参考音频路径(用于3s极速复刻模式)
|
||||
reference_audio = ""
|
||||
|
||||
# 提示文本(用于3s极速复刻模式)
|
||||
prompt_text = ""
|
||||
|
||||
# API请求超时(秒)
|
||||
timeout = 300
|
||||
|
||||
# 音频格式
|
||||
audio_format = "wav"
|
||||
|
||||
[comfyui]
|
||||
server = "http://127.0.0.1:8188"
|
||||
# 必须是 ComfyUI 的 input 目录, backend 会把 refer_wav 复制进去, 再用 LoadAudio 读取
|
||||
input_dir = "/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input"
|
||||
timeout = 120
|
||||
audio_quality = "128k" # SaveAudioMP3: V0/128k/320k
|
||||
mlx_python = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python"
|
||||
mlx_cli = "/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py"
|
||||
default_style = "default"
|
||||
# Split comfyui backend into two convenient aliases:
|
||||
# - comfyui_voiceclone: only uses styles whose mode is voice_clone (or absent)
|
||||
# - comfyui_customvoice: only uses styles whose mode is custom_voice
|
||||
# These keys let you pick different defaults without duplicating comfyui.styles.
|
||||
voiceclone_default_style = "default"
|
||||
customvoice_default_style = "seiun"
|
||||
auto_instruct_enabled = true
|
||||
auto_instruct_max_chars = 320
|
||||
|
||||
# 自动推断 instruct 时固定附加的“基调”(persona)。会作为 `基调=...;` 前缀插入。
|
||||
# 注意:值里不要包含 ';' 或 '='(backend 会做清洗,但建议从源头避免)。
|
||||
auto_instruct_base_tone = "女性约15-16岁,清澈透亮但慵懒的轻女高音,句尾元音随意拉长且略带鼻腔撒娇,咬字松弛像刚睡醒,可在慵懒与冷静锐利间切换,带戏谑亲和"
|
||||
|
||||
# 可选:完整基调原文(保留备份,当前不启用)
|
||||
# auto_instruct_base_tone = """
|
||||
# 女性,外表约15-16岁,音色是清澈透亮却带有慵懒感的轻女高音(Light Soprano)。
|
||||
#
|
||||
# 嗓音轻盈飘逸,带有明显的“云朵般”的漂浮感,起初是漫不经心的拖沓语调,其特征在于句尾元音的随意拉长(Drawl)以及略带鼻腔共鸣的撒娇感。咬字呈现出一种仿佛刚睡醒般的松弛,甚至伴有刻意为之的含糊,像是一只在阳光下伸懒腰的猫。
|
||||
#
|
||||
# 随后,这种慵懒被一种狡黠的机敏所取代,声音在毫无干劲的叹息与看穿一切的通透感之间自如切换。在表现谋略或胜负欲的瞬间,音色会瞬间收紧,去除了所有的气声装饰与慵懒拖音,转为冷静、干练且直击要害的中高频。
|
||||
#
|
||||
# 表现风格既显得捉摸不透又带有戏谑的亲和力,伴随着轻巧的换气声和偶尔出现的、带有试探意味的升调尾音。仿佛在脱力系(Listless)的无害表象之下,潜藏着绝顶聪明的头脑与绝不让步的自尊。
|
||||
# """
|
||||
|
||||
auto_instruct_prompt = """
|
||||
你是精通声学特征与戏剧表演的 AI 配音导演。你的任务是根据「待朗读文本」生成一行 TTS instruct(用于 Qwen3-TTS CustomVoice 的语音表演控制)。
|
||||
|
||||
硬性要求:
|
||||
- 只输出一行(单行 KV),不要解释,不要引号/代码块,不要复述原文。
|
||||
- 必须同时包含以下字段,并用英文分号 ';' 分隔:情绪、强度、语速、停顿、表现
|
||||
- 输出格式固定为:情绪=<...>;强度=<...>;语速=<...>;停顿=<...>;表现=<...>
|
||||
- 语速可选:很慢/稍慢/正常/稍快/很快
|
||||
- 停顿可选:很少/自然/稍多/很多
|
||||
- 强度可选:很低/低/中/高/很高
|
||||
- 表现:用 3-6 个短提示词,使用逗号分隔(不要用分号),如:声压高,咬字重,重音强,尾音下压
|
||||
- 长度 <= {max_chars} 字
|
||||
|
||||
强制增强规则(避免“生气但听起来不够生气”):
|
||||
- 如果文本出现:非常/极其/真的/气死/怒/吼/滚/闭嘴/你再说一次 等强烈信号,情绪优先用「愤怒」,强度至少「高」,表现要包含“声压高/咬字重/重音强/尾音下压”中的至少 2 项。
|
||||
- 如果是嘲讽或冷笑式的怒气:情绪写「愤怒(冷)」或「愤怒+嘲讽」,表现包含“冷硬/压低/咬字利落/少气声”。
|
||||
|
||||
文本语言: {lang}
|
||||
待朗读文本: {text}
|
||||
"""
|
||||
|
||||
# 基础停顿(秒)。当 instruct 包含“停顿=...”时,会按 很少/自然/稍多/很多 做倍率缩放。
|
||||
pause_linebreak = 0.18
|
||||
period_pause = 0.22
|
||||
comma_pause = 0.1
|
||||
question_pause = 0.2
|
||||
hyphen_pause = 0.06
|
||||
|
||||
[[comfyui.styles]]
|
||||
name = "default"
|
||||
refer_wav = "/Users/xenon/Downloads/seiun_tts/qingyun_tiankong_voice/default_ref_24k_mono.wav"
|
||||
prompt_text = "私にしてはがんばった方ではないでしょーか?"
|
||||
language = "Auto"
|
||||
model_choice = "1.7B"
|
||||
precision = "bf16"
|
||||
seed = 0
|
||||
max_new_tokens = 2048
|
||||
top_p = 0.8
|
||||
top_k = 20
|
||||
temperature = 1
|
||||
repetition_penalty = 1.05
|
||||
|
||||
[[comfyui.styles]]
|
||||
name = "seiun"
|
||||
mode = "custom_voice"
|
||||
model_path = "/Users/xenon/Downloads/checkpoint-epoch-9"
|
||||
speaker = "seiun"
|
||||
instruct = "__AUTO__"
|
||||
speed = 1
|
||||
language = "Auto"
|
||||
seed = 0
|
||||
max_new_tokens = 2048
|
||||
top_p = 0.9
|
||||
top_k = 20
|
||||
temperature = 0.9
|
||||
repetition_penalty = 1.05
|
||||
|
|
@ -0,0 +1,103 @@
|
|||
"""
|
||||
配置键常量定义
|
||||
集中管理所有配置键,避免硬编码
|
||||
"""
|
||||
|
||||
|
||||
class ConfigKeys:
    """Central registry of configuration key constants (avoids hard-coded key strings)."""

    # ========== Plugin settings ==========
    PLUGIN_ENABLED = "plugin.enabled"
    PLUGIN_CONFIG_VERSION = "plugin.config_version"

    # ========== General settings ==========
    GENERAL_DEFAULT_BACKEND = "general.default_backend"
    GENERAL_TIMEOUT = "general.timeout"
    GENERAL_MAX_TEXT_LENGTH = "general.max_text_length"
    GENERAL_USE_REPLYER_REWRITE = "general.use_replyer_rewrite"
    GENERAL_AUDIO_OUTPUT_DIR = "general.audio_output_dir"
    GENERAL_USE_BASE64_AUDIO = "general.use_base64_audio"
    GENERAL_SPLIT_SENTENCES = "general.split_sentences"
    GENERAL_SPLIT_DELAY = "general.split_delay"
    GENERAL_SPLIT_MIN_TOTAL_CHARS = "general.split_min_total_chars"
    GENERAL_SPLIT_MIN_SENTENCE_CHARS = "general.split_min_sentence_chars"
    GENERAL_SPLIT_MAX_SEGMENTS = "general.split_max_segments"
    GENERAL_SPLIT_CHUNK_CHARS = "general.split_chunk_chars"
    GENERAL_SEND_ERROR_MESSAGES = "general.send_error_messages"

    # ========== Component toggles ==========
    COMPONENTS_ACTION_ENABLED = "components.action_enabled"
    COMPONENTS_COMMAND_ENABLED = "components.command_enabled"
    COMPONENTS_INSTRUCT_COMMAND_ENABLED = "components.instruct_command_enabled"

    # ========== Probability control ==========
    PROBABILITY_ENABLED = "probability.enabled"
    PROBABILITY_BASE_PROBABILITY = "probability.base_probability"
    PROBABILITY_KEYWORD_FORCE_TRIGGER = "probability.keyword_force_trigger"
    PROBABILITY_FORCE_KEYWORDS = "probability.force_keywords"

    # ========== AI Voice backend ==========
    AI_VOICE_DEFAULT_CHARACTER = "ai_voice.default_character"
    AI_VOICE_ALIAS_MAP = "ai_voice.alias_map"

    # ========== GSV2P backend ==========
    GSV2P_API_URL = "gsv2p.api_url"
    GSV2P_API_TOKEN = "gsv2p.api_token"
    GSV2P_DEFAULT_VOICE = "gsv2p.default_voice"
    GSV2P_TIMEOUT = "gsv2p.timeout"
    GSV2P_MODEL = "gsv2p.model"
    GSV2P_RESPONSE_FORMAT = "gsv2p.response_format"
    GSV2P_SPEED = "gsv2p.speed"

    # ========== GPT-SoVITS backend ==========
    GPT_SOVITS_SERVER = "gpt_sovits.server"
    GPT_SOVITS_STYLES = "gpt_sovits.styles"

    # ========== Doubao backend ==========
    DOUBAO_API_URL = "doubao.api_url"
    DOUBAO_APP_ID = "doubao.app_id"
    DOUBAO_ACCESS_KEY = "doubao.access_key"
    DOUBAO_RESOURCE_ID = "doubao.resource_id"
    DOUBAO_DEFAULT_VOICE = "doubao.default_voice"
    DOUBAO_TIMEOUT = "doubao.timeout"
    DOUBAO_AUDIO_FORMAT = "doubao.audio_format"
    DOUBAO_SAMPLE_RATE = "doubao.sample_rate"
    DOUBAO_BITRATE = "doubao.bitrate"
    DOUBAO_SPEED = "doubao.speed"
    DOUBAO_VOLUME = "doubao.volume"
    DOUBAO_CONTEXT_TEXTS = "doubao.context_texts"

    # ========== CosyVoice backend ==========
    COSYVOICE_GRADIO_URL = "cosyvoice.gradio_url"
    COSYVOICE_DEFAULT_MODE = "cosyvoice.default_mode"
    COSYVOICE_DEFAULT_INSTRUCT = "cosyvoice.default_instruct"
    COSYVOICE_REFERENCE_AUDIO = "cosyvoice.reference_audio"
    COSYVOICE_PROMPT_TEXT = "cosyvoice.prompt_text"
    COSYVOICE_TIMEOUT = "cosyvoice.timeout"
    COSYVOICE_AUDIO_FORMAT = "cosyvoice.audio_format"

    # ========== ComfyUI (Workflow API) backend ==========
    COMFYUI_SERVER = "comfyui.server"
    COMFYUI_INPUT_DIR = "comfyui.input_dir"
    COMFYUI_TIMEOUT = "comfyui.timeout"
    COMFYUI_DEFAULT_STYLE = "comfyui.default_style"
    COMFYUI_STYLES = "comfyui.styles"
    # Convenience aliases to split voiceclone/customvoice at the plugin level.
    # Both backends still use comfyui.styles, but these keys let you pick different defaults.
    COMFYUI_VOICECLONE_DEFAULT_STYLE = "comfyui.voiceclone_default_style"
    COMFYUI_CUSTOMVOICE_DEFAULT_STYLE = "comfyui.customvoice_default_style"
    COMFYUI_AUDIO_QUALITY = "comfyui.audio_quality"
    COMFYUI_MLX_PYTHON = "comfyui.mlx_python"
    COMFYUI_MLX_CLI = "comfyui.mlx_cli"
    COMFYUI_PAUSE_LINEBREAK = "comfyui.pause_linebreak"
    COMFYUI_PERIOD_PAUSE = "comfyui.period_pause"
    COMFYUI_COMMA_PAUSE = "comfyui.comma_pause"
    COMFYUI_QUESTION_PAUSE = "comfyui.question_pause"
    COMFYUI_HYPHEN_PAUSE = "comfyui.hyphen_pause"

    # Auto instruct (CustomVoice)
    COMFYUI_AUTO_INSTRUCT_ENABLED = "comfyui.auto_instruct_enabled"
    COMFYUI_AUTO_INSTRUCT_BASE_TONE = "comfyui.auto_instruct_base_tone"
    COMFYUI_AUTO_INSTRUCT_PROMPT = "comfyui.auto_instruct_prompt"
    COMFYUI_AUTO_INSTRUCT_MAX_CHARS = "comfyui.auto_instruct_max_chars"
|
||||
|
|
@ -0,0 +1,972 @@
|
|||
"""
|
||||
统一TTS语音合成插件
|
||||
支持多种后端:AI Voice (MaiCore内置) / GSV2P (云API) / GPT-SoVITS (本地服务) / 豆包语音 (云API) / CosyVoice (ModelScope Gradio) / ComfyUI
|
||||
|
||||
Version: 3.2.3
|
||||
Author: 靓仔
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.dont_write_bytecode = True
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from typing import List, Tuple, Type, Optional
|
||||
|
||||
from src.common.logger import get_logger
|
||||
from src.plugin_system.base.base_plugin import BasePlugin
|
||||
from src.plugin_system.apis.plugin_register_api import register_plugin
|
||||
from src.plugin_system.base.base_action import BaseAction, ActionActivationType
|
||||
from src.plugin_system.base.base_command import BaseCommand
|
||||
from src.plugin_system.base.component_types import ComponentInfo, ChatMode
|
||||
from src.plugin_system.base.config_types import ConfigField
|
||||
from src.plugin_system.apis import generator_api
|
||||
|
||||
# 导入模块化的后端和工具
|
||||
from .backends import TTSBackendRegistry, TTSResult
|
||||
from .backends.ai_voice import AI_VOICE_ALIAS_MAP
|
||||
from .backends.doubao import DOUBAO_EMOTION_MAP
|
||||
from .utils.text import TTSTextUtils
|
||||
from .config_keys import ConfigKeys
|
||||
|
||||
logger = get_logger("tts_voice_plugin")
|
||||
|
||||
# Backend identifiers accepted by the plugin (used to validate
# general.default_backend and LLM-supplied "backend" parameters).
VALID_BACKENDS = [
    "ai_voice",
    "gsv2p",
    "gpt_sovits",
    "doubao",
    "cosyvoice",
    "comfyui",
    "comfyui_voiceclone",
    "comfyui_customvoice",
]
|
||||
|
||||
|
||||
class TTSExecutorMixin:
    """
    TTS executor mixin.

    Backend-dispatch logic shared by the Action and Command components.
    Host classes are expected to provide ``get_config``, ``log_prefix``,
    ``send_custom``/``send_command``/``send_text`` and either
    ``chat_stream`` (Action) or ``message`` (Command).
    """

    def _create_backend(self, backend_name: str):
        """
        Instantiate a backend by name.

        Args:
            backend_name: Backend identifier.

        Returns:
            The backend instance, or None when the name is unknown.
        """
        backend = TTSBackendRegistry.create(
            backend_name,
            self.get_config,
            self.log_prefix
        )

        if backend:
            # Inject the send callbacks the backend needs to deliver audio.
            if hasattr(backend, 'set_send_custom'):
                backend.set_send_custom(self.send_custom)
            if hasattr(backend, 'set_send_command'):
                backend.set_send_command(self.send_command)

        return backend

    async def _execute_backend(
        self,
        backend_name: str,
        text: str,
        voice: str = "",
        emotion: str = ""
    ) -> TTSResult:
        """
        Execute the named backend.

        Args:
            backend_name: Backend identifier.
            text: Text to convert.
            voice: Voice/style name.
            emotion: Emotion hint (Doubao backend only).

        Returns:
            TTSResult
        """
        backend = self._create_backend(backend_name)

        if not backend:
            return TTSResult(
                success=False,
                message=f"未知的TTS后端: {backend_name}"
            )

        # AI Voice only works in group chats; reroute private chats to GSV2P.
        if backend_name == "ai_voice":
            is_private = self._check_is_private_chat()
            if is_private:
                logger.info(f"{self.log_prefix} AI语音仅支持群聊,自动切换到GSV2P后端")
                return await self._execute_backend("gsv2p", text, voice, emotion)

        # Pass chat context through for backends that need MaiBot LLM APIs (e.g., comfyui auto_instruct).
        chat_stream = None
        if hasattr(self, "chat_stream"):
            chat_stream = getattr(self, "chat_stream", None)
        elif hasattr(self, "message"):
            chat_stream = getattr(getattr(self, "message", None), "chat_stream", None)

        return await backend.execute(text, voice, emotion=emotion, chat_stream=chat_stream)

    def _check_is_private_chat(self) -> bool:
        """Return True when the current conversation is a private (non-group) chat."""
        # Actions expose chat_stream directly.
        if hasattr(self, 'chat_stream'):
            return not getattr(self.chat_stream, 'group_info', None)
        # Commands expose the incoming message instead.
        if hasattr(self, 'message'):
            msg_info = getattr(self.message, 'message_info', None)
            if msg_info:
                return not getattr(msg_info, 'group_info', None)
        return False

    def _get_default_backend(self) -> str:
        """Return the configured default backend, falling back to gsv2p when invalid."""
        backend = self.get_config(ConfigKeys.GENERAL_DEFAULT_BACKEND, "gsv2p")
        if backend not in VALID_BACKENDS:
            logger.warning(f"{self.log_prefix} 配置的默认后端 '{backend}' 无效,使用 gsv2p")
            return "gsv2p"
        return backend

    async def _send_error(self, message: str) -> None:
        """
        Send an error notice to the chat (gated by the global config switch).

        Args:
            message: Error text to send.
        """
        if self.get_config(ConfigKeys.GENERAL_SEND_ERROR_MESSAGES, True):
            await self.send_text(message)
|
||||
|
||||
|
||||
class UnifiedTTSAction(BaseAction, TTSExecutorMixin):
    """Unified TTS Action — triggered automatically by the LLM on keyword match."""

    action_name = "unified_tts_action"
    action_description = "用语音回复(支持AI Voice/GSV2P/GPT-SoVITS/豆包语音多后端)"
    activation_type = ActionActivationType.KEYWORD
    mode_enable = ChatMode.ALL
    parallel_action = False

    # Keywords whose presence in the chat activates this action.
    activation_keywords = [
        "语音", "说话", "朗读", "念一下", "读出来",
        "voice", "speak", "tts", "语音回复", "用语音说", "播报"
    ]
    keyword_case_sensitive = False

    # Parameter schema presented to the LLM when it invokes the action.
    action_parameters = {
        "text": "要转换为语音的文本内容(必填)",
        "backend": "TTS后端引擎 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice,可选,建议省略让系统自动使用配置的默认后端)",
        "voice": "音色/风格参数(可选)",
        "emotion": "情感/语气参数(可选,仅豆包后端有效)。支持:开心/兴奋/温柔/骄傲/生气/愤怒/伤心/失望/委屈/平静/严肃/疑惑/慢速/快速/小声/大声等"
    }

    # Usage guidance shown to the LLM for deciding when to fire.
    action_require = [
        "当用户要求用语音回复时使用",
        "当回复简短问候语时使用(如早上好、晚安、你好等)",
        "当想让回复更活泼生动时可以使用",
        "注意:回复内容过长或者过短不适合用语音",
        "注意:backend参数建议省略,系统会自动使用配置的默认后端"
    ]

    associated_types = ["text", "command"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Initialize the action and cache frequently-used general settings."""
    super().__init__(*args, **kwargs)
    self.timeout = self.get_config(ConfigKeys.GENERAL_TIMEOUT, 60)
    # Default aligned with config.toml and _get_final_text (previously 500,
    # which disagreed with the 200 used everywhere else in the plugin).
    self.max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 200)
|
||||
|
||||
def _check_force_trigger(self, text: str) -> bool:
    """Return True when the text contains a keyword that forces a voice reply."""
    force_enabled = self.get_config(ConfigKeys.PROBABILITY_KEYWORD_FORCE_TRIGGER, True)
    if not force_enabled:
        return False

    default_keywords = ["一定要用语音", "必须语音", "语音回复我", "务必用语音"]
    keywords = self.get_config(ConfigKeys.PROBABILITY_FORCE_KEYWORDS, default_keywords)

    # A single substring hit is enough to force TTS.
    for keyword in keywords:
        if keyword in text:
            return True
    return False
|
||||
|
||||
def _probability_check(self, text: str) -> bool:
|
||||
"""概率控制检查"""
|
||||
if not self.get_config(ConfigKeys.PROBABILITY_ENABLED, True):
|
||||
return True
|
||||
|
||||
base_prob = self.get_config(ConfigKeys.PROBABILITY_BASE_PROBABILITY, 1.0)
|
||||
base_prob = max(0.0, min(1.0, base_prob))
|
||||
result = random.random() < base_prob
|
||||
logger.info(f"{self.log_prefix} 概率检查: {base_prob:.2f}, 结果={'通过' if result else '未通过'}")
|
||||
return result
|
||||
|
||||
    async def _get_final_text(self, raw_text: str, reason: str, use_replyer: bool) -> Tuple[bool, str]:
        """Produce the final text to synthesize.

        When *use_replyer* is True the text is (re)generated through the normal
        reply pipeline so it matches the bot's usual tone and prompt parameters;
        otherwise the planner-provided text is used verbatim.

        Args:
            raw_text: Text proposed by the planner (may be empty).
            reason: Planner's reply reason, forwarded to the generator.
            use_replyer: Whether to run the reply generator at all.

        Returns:
            ``(ok, text)`` — ``ok`` is False when no usable text could be produced.
        """
        max_text_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 200)

        if not use_replyer:
            if not raw_text:
                return False, ""
            return True, raw_text

        try:
            # Always go through generate_reply so the POST_LLM event fires
            # (schedule injection); rewrite_reply would skip that event and
            # therefore is not suitable here.
            # The length constraint is appended LAST to exploit the LLM's
            # recency bias and improve compliance.
            extra_info_parts = []
            if raw_text:
                extra_info_parts.append(f"期望的回复内容:{raw_text}")
            extra_info_parts.append(
                f"【重要】你的回复必须控制在{max_text_length}字以内,这是硬性要求。"
                f"超过此长度将无法转换为语音。请直接回复核心内容,不要啰嗦。"
            )

            success, llm_response = await generator_api.generate_reply(
                chat_stream=self.chat_stream,
                reply_message=self.action_message,
                reply_reason=reason,
                extra_info="\n".join(extra_info_parts),
                request_type="tts_voice_plugin",
                from_plugin=False  # allow POST_LLM events so schedule injection applies
            )
            if success and llm_response and llm_response.content:
                logger.info(f"{self.log_prefix} 语音内容生成成功")
                return True, llm_response.content.strip()

            # Generation failed: fall back to the planner-provided text if any.
            if raw_text:
                logger.warning(f"{self.log_prefix} 内容生成失败,使用原始文本")
                return True, raw_text

            return False, ""
        except Exception as e:
            logger.error(f"{self.log_prefix} 调用 replyer 出错: {e}")
            # Best effort: still succeed with the raw text when we have it.
            return bool(raw_text), raw_text
|
||||
|
||||
    async def execute(self) -> Tuple[bool, str]:
        """Run the TTS action end to end.

        Pipeline: obtain final text -> probability / force-trigger gate ->
        clean & length-check -> pick backend -> send as one voice message or
        as several sentence segments. On any gate failure the reply degrades
        to plain text instead of erroring out.

        Returns:
            ``(success, status_message)``.
        """
        def _chunk_sentences(
            parts: List[str], target_chars: int, max_chunks: int
        ) -> List[str]:
            # Greedy packing: reduces tiny fragments into fewer, longer segments.
            if not parts:
                return []
            if target_chars <= 0:
                target_chars = 120

            def pack(tgt: int) -> List[str]:
                # Concatenate consecutive parts while they fit into `tgt` chars.
                out: List[str] = []
                cur = ""
                for s in parts:
                    s = (s or "").strip()
                    if not s:
                        continue
                    if not cur:
                        cur = s
                        continue
                    if len(cur) + len(s) <= tgt:
                        cur += s
                    else:
                        out.append(cur)
                        cur = s
                if cur:
                    out.append(cur)
                return out

            packed = pack(target_chars)
            # Still too many chunks: enlarge the target so the total fits max_chunks.
            if max_chunks and max_chunks > 0 and len(packed) > max_chunks:
                total = len("".join(parts))
                new_target = max(target_chars, int(total / max_chunks) + 1)
                packed = pack(new_target)
            return packed

        # NOTE: the two closures below read `backend`, `clean_text`, `voice`,
        # `emotion`, `sentences` and `split_delay` from the enclosing scope;
        # those names are assigned later in this function, before the closures
        # are ever called.
        async def send_message_single_sentences() -> Tuple[bool, str]:
            # Send the whole cleaned text as a single voice message.
            result = await self._execute_backend(backend, clean_text, voice, emotion)
            if result.success:
                # Record a detailed action entry so the planner avoids re-running it.
                text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用语音回复:{text_preview}",
                    action_done=True
                )
            else:
                await self._send_error(f"语音合成失败: {result.message}")

            return result.success, result.message

        async def send_message_with_splited_sentences() -> Tuple[bool, str]:
            # Segmented mode: synthesize and send each sentence separately.
            if len(sentences) > 1:
                logger.info(f"{self.log_prefix} 分段发送模式:共 {len(sentences)} 句")

                success_count = 0
                all_sentences_text = []

                for i, sentence in enumerate(sentences):
                    if not sentence.strip():
                        continue

                    logger.debug(f"{self.log_prefix} 发送第 {i + 1}/{len(sentences)} 句: {sentence[:30]}...")
                    result = await self._execute_backend(backend, sentence, voice, emotion)

                    if result.success:
                        success_count += 1
                        all_sentences_text.append(sentence)
                    else:
                        logger.warning(f"{self.log_prefix} 第 {i + 1} 句发送失败: {result.message}")

                    # Small pause between segments (not after the last one).
                    if i < len(sentences) - 1 and split_delay > 0:
                        await asyncio.sleep(split_delay)

                # Record the action as done if at least one segment went out.
                if success_count > 0:
                    display_text = "".join(all_sentences_text)
                    text_preview = display_text[:80] + "..." if len(display_text) > 80 else display_text
                    await self.store_action_info(
                        action_build_into_prompt=True,
                        action_prompt_display=f"已用语音回复({success_count}段):{text_preview}",
                        action_done=True
                    )
                    return True, f"成功发送 {success_count}/{len(sentences)} 条语音"
                else:
                    await self._send_error("语音合成失败")
                    return False, "所有语音发送失败"
            else:
                # A single sentence: fall back to the plain single-message path.
                return await send_message_single_sentences()

        # NOTE: stray no-op string from the original source (kept verbatim);
        # it is not this function's docstring because it follows the nested defs.
        """执行TTS语音合成"""
        try:
            raw_text = self.action_data.get("text", "").strip()
            voice = self.action_data.get("voice", "")
            reason = self.action_data.get("reason", "")
            emotion = self.action_data.get("emotion", "")

            use_replyer = self.get_config(ConfigKeys.GENERAL_USE_REPLYER_REWRITE, True)

            # Obtain the final (possibly LLM-rewritten) text.
            success, final_text = await self._get_final_text(raw_text, reason, use_replyer)
            if not success or not final_text:
                await self._send_error("无法生成语音内容")
                return False, "文本为空"

            # Probability gate (force-trigger keywords bypass it). On failure
            # the content is still delivered — just as plain text.
            force_trigger = self._check_force_trigger(final_text)
            if not force_trigger and not self._probability_check(final_text):
                logger.info(f"{self.log_prefix} 概率检查未通过,使用文字回复")
                await self.send_text(final_text)
                text_preview = final_text[:80] + "..." if len(final_text) > 80 else final_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用文字回复(语音概率未触发):{text_preview}",
                    action_done=True
                )
                return True, "概率检查未通过,已发送文字回复"

            # Clean the text (strip special characters, replace slang).
            # Length is expected to be respected by the LLM at generation time;
            # this step only does character-level cleanup.
            clean_text = TTSTextUtils.clean_text(final_text, self.max_text_length)
            if not clean_text:
                await self._send_error("文本处理后为空")
                return False, "文本处理后为空"

            # If the cleaned text still exceeds the limit, the LLM ignored the
            # constraint — degrade to a plain-text reply.
            if len(clean_text) > self.max_text_length:
                logger.warning(
                    f"{self.log_prefix} LLM生成的文本超过长度限制 "
                    f"({len(clean_text)} > {self.max_text_length}字符),降级为文字回复"
                )
                await self.send_text(clean_text)
                text_preview = clean_text[:80] + "..." if len(clean_text) > 80 else clean_text
                await self.store_action_info(
                    action_build_into_prompt=True,
                    action_prompt_display=f"已用文字回复(内容过长):{text_preview}",
                    action_done=True
                )
                return True, "内容超过语音长度限制,已改为文字回复"

            # Pick the configured default backend and run it.
            backend = self._get_default_backend()
            logger.info(f"{self.log_prefix} 使用配置的默认后端: {backend}")

            # Segmentation settings.
            split_sentences = self.get_config(ConfigKeys.GENERAL_SPLIT_SENTENCES, True)
            split_delay = self.get_config(ConfigKeys.GENERAL_SPLIT_DELAY, 0.3)

            sentences = None

            # Prefer explicit split markers inserted by the smart-segmentation plugin.
            if '|||SPLIT|||' in clean_text:
                logger.info("found split marker from smart segmentation plugin")
                sentences = [s.strip() for s in clean_text.split("|||SPLIT|||") if s.strip()]
                # If the upstream splitter is too aggressive, pack back into fewer segments.
                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)
                if max_segments and max_segments > 0 and len(sentences) > max_segments:
                    sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
                return await send_message_with_splited_sentences()
            elif split_sentences:
                # Auto-segmentation: short text stays whole; long text is split
                # into at most N segments to avoid spamming the chat.
                min_total = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_TOTAL_CHARS, 120) or 120)
                min_sentence = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MIN_SENTENCE_CHARS, 6) or 6)
                max_segments = int(self.get_config(ConfigKeys.GENERAL_SPLIT_MAX_SEGMENTS, 3) or 3)
                chunk_chars = int(self.get_config(ConfigKeys.GENERAL_SPLIT_CHUNK_CHARS, 110) or 110)

                if len(clean_text) < min_total:
                    sentences = [clean_text]
                else:
                    sentences = TTSTextUtils.split_sentences(clean_text, min_length=min_sentence)
                    if max_segments and max_segments > 0:
                        sentences = _chunk_sentences(sentences, target_chars=chunk_chars, max_chunks=max_segments)
                return await send_message_with_splited_sentences()
            else:
                # Segmentation disabled: send as a single voice message.
                return await send_message_single_sentences()

        except Exception as e:
            error_msg = str(e)
            logger.error(f"{self.log_prefix} TTS语音合成出错: {error_msg}")
            await self._send_error(f"语音合成出错: {error_msg}")
            return False, error_msg
|
||||
|
||||
|
||||
class UnifiedTTSCommand(BaseCommand, TTSExecutorMixin):
    """Unified TTS command — manually triggered by the user via /tts-style commands."""

    command_name = "unified_tts_command"
    command_description = "将文本转换为语音,支持多种后端和音色"
    # One regex covers every command alias; named groups: text (required),
    # voice (optional, after -v) and backend (optional trailing token).
    command_pattern = r"^/(?:tts|voice|gsv2p|gptsovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice)\s+(?P<text>.+?)(?:\s+-v\s+(?P<voice>\S+))?(?:\s+(?P<backend>ai_voice|gsv2p|gpt_sovits|doubao|cosyvoice|comfyui|comfyui_voiceclone|comfyui_customvoice))?$"
    command_help = "将文本转换为语音。用法:/tts 你好世界 [-v 音色] [后端]"
    command_examples = [
        "/tts 你好,世界!",
        "/tts 今天天气不错 -v 小新",
        "/gptsovits 你好世界 -v default",
        "/cosyvoice 你好世界 -v 四川话",
        "/tts 试试 -v 温柔妹妹 ai_voice",
        "/gsv2p 你好世界",
        "/doubao 你好世界 -v 开心"
    ]
    # Matched messages are consumed and not processed further.
    intercept_message = True
|
||||
|
||||
    async def _send_help(self):
        """Send the plugin's usage help text to the chat.

        The help body is a fixed, user-facing template (Chinese); only the
        currently configured default backend is appended dynamically.
        """
        default_backend = self._get_default_backend()

        help_text = """【TTS语音合成插件帮助】

📝 基本语法:
/tts <文本> [-v <音色>] [后端]

🎯 快捷命令:
/tts <文本> 使用默认后端
/voice <文本> 使用 AI Voice
/gsv2p <文本> 使用 GSV2P
/gptsovits <文本> 使用 GPT-SoVITS
/doubao <文本> 使用 豆包语音
/cosyvoice <文本> 使用 CosyVoice
/comfyui <文本> 使用 ComfyUI(本地工作流)
/comfyui_voiceclone <文本> 使用 ComfyUI VoiceClone
/comfyui_customvoice <文本> 使用 ComfyUI CustomVoice

🔊 可用后端:
• ai_voice - MaiCore内置(仅群聊)
• gsv2p - 云端API,高质量
• gpt_sovits - 本地服务,可定制
• doubao - 火山引擎,支持情感
• cosyvoice - 阿里云,支持方言
• comfyui - 本地ComfyUI工作流(自动按 style.mode 选择)
• comfyui_voiceclone - 本地ComfyUI工作流(仅 VoiceClone)
• comfyui_customvoice - 本地ComfyUI工作流(仅 CustomVoice)

🎭 音色/情感参数(-v):
• AI Voice: 小新、温柔妹妹、霸道总裁、妲己 等22种
• GSV2P: 原神-中文-派蒙_ZH 等(见API文档)
• 豆包: 开心、生气、伤心、撒娇、严肃 等
• CosyVoice: 广东话、四川话、东北话、开心、慢速 等

📌 示例:
/tts 你好世界
/tts 今天真开心 -v 开心
/gptsovits 这是本地语音合成
/doubao 我生气了 -v 生气
/cosyvoice 你好 -v 广东话
/voice 测试一下 -v 温柔妹妹

⚙️ 当前默认后端:""" + default_backend

        await self.send_text(help_text)
|
||||
|
||||
def _determine_backend(self, user_backend: str) -> Tuple[str, str]:
|
||||
"""
|
||||
确定使用的后端
|
||||
|
||||
Returns:
|
||||
(backend_name, source_description)
|
||||
"""
|
||||
# 1. 检查命令前缀
|
||||
raw_text = self.message.raw_message if self.message.raw_message else self.message.processed_plain_text
|
||||
if raw_text:
|
||||
# 命令前缀到后端的映射
|
||||
prefix_backend_map = {
|
||||
"/gsv2p": "gsv2p",
|
||||
"/gptsovits": "gpt_sovits",
|
||||
"/doubao": "doubao",
|
||||
"/cosyvoice": "cosyvoice",
|
||||
"/voice": "ai_voice",
|
||||
"/comfyui": "comfyui",
|
||||
"/comfyui_voiceclone": "comfyui_voiceclone",
|
||||
"/comfyui_customvoice": "comfyui_customvoice",
|
||||
}
|
||||
for prefix, backend in prefix_backend_map.items():
|
||||
if raw_text.startswith(prefix):
|
||||
return backend, f"命令前缀 {prefix}"
|
||||
|
||||
# 2. 检查命令参数
|
||||
if user_backend and user_backend in VALID_BACKENDS:
|
||||
return user_backend, f"命令参数 {user_backend}"
|
||||
|
||||
# 3. 使用配置文件默认值
|
||||
return self._get_default_backend(), "配置文件"
|
||||
|
||||
    async def execute(self) -> Tuple[bool, str, bool]:
        """Handle a manual TTS command.

        Returns:
            ``(success, message, intercept)`` — intercept=True stops further
            processing of the triggering message.
        """
        try:
            text = self.matched_groups.get("text", "").strip()
            voice = self.matched_groups.get("voice", "")
            user_backend = self.matched_groups.get("backend", "")

            # "/tts help" prints usage instead of synthesizing.
            if text.lower() == "help":
                await self._send_help()
                return True, "显示帮助信息", True

            if not text:
                await self._send_error("请输入要转换为语音的文本内容")
                return False, "缺少文本内容", True

            # Resolve backend: command prefix > explicit argument > config default.
            backend, backend_source = self._determine_backend(user_backend)

            # Normalize/clean the text before synthesis.
            max_length = self.get_config(ConfigKeys.GENERAL_MAX_TEXT_LENGTH, 500)
            clean_text = TTSTextUtils.clean_text(text, max_length)

            if not clean_text:
                await self._send_error("文本处理后为空")
                return False, "文本处理后为空", True

            # Over-long input degrades to a plain-text reply instead of failing.
            if len(clean_text) > max_length:
                await self.send_text(
                    f"文本过长({len(clean_text)}字符),"
                    f"超过语音合成限制({max_length}字符),"
                    f"已改为文字发送。\n\n{clean_text}"
                )
                return True, "文本过长,已改为文字发送", True

            logger.info(f"{self.log_prefix} 执行TTS命令 (后端: {backend} [来源: {backend_source}], 音色: {voice})")

            # For CosyVoice and Doubao the -v argument actually carries the
            # emotion/dialect, so it is forwarded as `emotion`, not `voice`.
            if backend in ["cosyvoice", "doubao"]:
                result = await self._execute_backend(backend, clean_text, voice="", emotion=voice)
            else:
                result = await self._execute_backend(backend, clean_text, voice)

            if not result.success:
                await self._send_error(f"语音合成失败: {result.message}")

            return result.success, result.message, True

        except Exception as e:
            logger.error(f"{self.log_prefix} TTS命令执行出错: {e}")
            await self._send_error(f"语音合成出错: {e}")
            return False, f"执行出错: {e}", True
|
||||
|
||||
|
||||
class TTSInstructCommand(BaseCommand):
    """Generate a CustomVoice instruct string (debug/preview helper)."""

    command_name = "tts_instruct_command"
    command_description = "根据待朗读文本生成 CustomVoice 的 instruct(情绪/语速/停顿)"
    # BUGFIX: the pattern previously used r"...\\s+..." — inside a raw string
    # "\\s" matches a literal backslash followed by 's', so the command never
    # matched normal input like "/tts_instruct 你好". "\s" (a single backslash
    # in a raw string) is the whitespace class that was intended, matching the
    # convention used by UnifiedTTSCommand's pattern.
    command_pattern = r"^/tts_instruct\s+(?P<text>.+?)$"
    command_help = "用法:/tts_instruct <文本>"
    command_examples = [
        "/tts_instruct 早上好,今天也要加油。",
        "/tts_instruct えっ?本当にそうなの?",
    ]
    # Matched messages are consumed and not processed further.
    intercept_message = True

    async def execute(self) -> Tuple[bool, str, int]:
        """Run instruct inference over the supplied text and send the result.

        Returns:
            ``(success, message, 2)`` — the trailing value mirrors the
            original implementation's fixed third element.
        """
        try:
            text = (self.matched_groups.get("text") or "").strip()
            if not text:
                await self.send_text("请输入要生成 instruct 的文本")
                return False, "缺少文本", 2

            # Use the same logic as ComfyUI backend auto_instruct.
            from .backends.comfyui import ComfyUIBackend
            from .utils.text import TTSTextUtils

            detected = TTSTextUtils.detect_language(text)
            chat_stream = getattr(self.message, "chat_stream", None)
            chat_id = getattr(chat_stream, "stream_id", None) if chat_stream else None

            backend = ComfyUIBackend(self.get_config, log_prefix=self.log_prefix)
            instruct = await backend._infer_instruct(
                text=text,
                detected_lang=detected,
                chat_stream=chat_stream,
                chat_id=chat_id,
                style_name="__command__",
            )

            if not instruct:
                await self.send_text("instruct 生成失败(可能未启用 comfyui.auto_instruct_enabled 或 LLM 不可用)")
                return False, "instruct 生成失败", 2

            await self.send_text(instruct)
            return True, "instruct 已生成", 2
        except Exception as e:
            await self.send_text(f"instruct 生成异常: {e}")
            return False, str(e), 2
|
||||
|
||||
|
||||
@register_plugin
class UnifiedTTSPlugin(BasePlugin):
    """Unified TTS plugin: text-to-speech with multiple selectable backends."""

    # Plugin metadata registered with MaiBot.
    plugin_name = "tts_voice_plugin"
    plugin_description = "统一TTS语音合成插件,支持AI Voice、GSV2P、GPT-SoVITS、豆包语音多种后端"
    plugin_version = "3.2.3"
    plugin_author = "靓仔"
    enable_plugin = True
    config_file_name = "config.toml"
    dependencies = []
    # Third-party Python packages required at runtime.
    python_dependencies = ["aiohttp"]

    # Human-readable headings for each section of the generated config file.
    config_section_descriptions = {
        "plugin": "插件基本配置",
        "general": "通用设置",
        "components": "组件启用控制",
        "probability": "概率控制配置",
        "ai_voice": "AI Voice后端配置",
        "gsv2p": "GSV2P后端配置",
        "gpt_sovits": "GPT-SoVITS后端配置",
        "doubao": "豆包语音后端配置",
        "cosyvoice": "CosyVoice后端配置",
        "comfyui": "ComfyUI工作流API后端配置"
    }
|
||||
|
||||
    # Declarative configuration schema consumed by MaiBot's plugin config
    # loader. The `description` strings are surfaced verbatim to users in the
    # generated config file, so they are kept as-is (Chinese).
    config_schema = {
        # Basic plugin toggles.
        "plugin": {
            "enabled": ConfigField(type=bool, default=True, description="是否启用插件"),
            "config_version": ConfigField(type=str, default="3.2.3", description="配置文件版本")
        },
        # General behavior: default backend, timeouts, length limits, segmentation.
        "general": {
            "default_backend": ConfigField(
                type=str, default="cosyvoice",
                description="默认TTS后端 (ai_voice/gsv2p/gpt_sovits/doubao/cosyvoice/comfyui/comfyui_voiceclone/comfyui_customvoice)"
            ),
            "timeout": ConfigField(type=int, default=60, description="请求超时时间(秒)"),
            "max_text_length": ConfigField(
                type=int, default=200,
                description="最大文本长度(该限制会在调用LLM时注入到prompt中,让LLM直接生成符合长度的回复,而不是被动截断)"
            ),
            "use_replyer_rewrite": ConfigField(
                type=bool, default=True,
                description="是否使用replyer润色语音内容"
            ),
            "audio_output_dir": ConfigField(
                type=str, default="",
                description="音频文件输出目录(支持相对路径和绝对路径,留空使用项目根目录)"
            ),
            "use_base64_audio": ConfigField(
                type=bool, default=True,
                description="是否使用base64编码发送音频(备选方案)"
            ),
            "split_sentences": ConfigField(
                type=bool, default=True,
                description="是否分段发送语音(每句话单独发送一条语音,避免长语音播放问题)"
            ),
            "split_delay": ConfigField(
                type=float, default=0.3,
                description="分段发送时每条语音之间的延迟(秒)"
            ),
            "split_min_total_chars": ConfigField(
                type=int, default=120,
                description="自动分段启用阈值:文本长度小于该值时不分段(避免短句被切成多段)",
            ),
            "split_min_sentence_chars": ConfigField(
                type=int, default=6,
                description="句子最小长度:过短片段会合并到前一句(用于减少碎片段)",
            ),
            "split_max_segments": ConfigField(
                type=int, default=3,
                description="自动分段最大段数(避免刷屏式多段语音)。0 表示不限制。",
            ),
            "split_chunk_chars": ConfigField(
                type=int, default=110,
                description="自动分段打包目标长度(字符)。用于把多句合并成更少段。",
            ),
            "send_error_messages": ConfigField(
                type=bool, default=True,
                description="是否发送错误提示消息(关闭后语音合成失败时不会发送错误信息给用户)"
            )
        },
        # Per-component enable switches.
        "components": {
            "action_enabled": ConfigField(type=bool, default=True, description="是否启用Action组件"),
            "command_enabled": ConfigField(type=bool, default=True, description="是否启用Command组件"),
            "instruct_command_enabled": ConfigField(type=bool, default=True, description="是否启用instruct调试命令组件(/tts_instruct)")
        },
        # Probability gating for the auto-triggered action.
        "probability": {
            "enabled": ConfigField(type=bool, default=False, description="是否启用概率控制"),
            "base_probability": ConfigField(type=float, default=1.0, description="基础触发概率"),
            "keyword_force_trigger": ConfigField(type=bool, default=True, description="关键词强制触发"),
            "force_keywords": ConfigField(
                type=list,
                default=["一定要用语音", "必须语音", "语音回复我", "务必用语音"],
                description="强制触发关键词"
            )
        },
        # Backend: MaiCore built-in AI Voice.
        "ai_voice": {
            "default_character": ConfigField(
                type=str,
                default="邻家小妹",
                description="默认音色(可选:小新、猴哥、四郎、东北老妹儿、广西大表哥、妲己、霸道总裁、酥心御姐、说书先生、憨憨小弟、憨厚老哥、吕布、元气少女、文艺少女、磁性大叔、邻家小妹、低沉男声、傲娇少女、爹系男友、暖心姐姐、温柔妹妹、书香少女)"
            )
        },
        # Backend: GSV2P cloud API.
        "gsv2p": {
            "api_url": ConfigField(
                type=str, default="https://gsv2p.acgnai.top/v1/audio/speech",
                description="GSV2P API地址"
            ),
            "api_token": ConfigField(type=str, default="", description="API认证Token"),
            "default_voice": ConfigField(type=str, default="原神-中文-派蒙_ZH", description="默认音色"),
            "timeout": ConfigField(type=int, default=120, description="API请求超时(秒)"),
            "model": ConfigField(type=str, default="tts-v4", description="TTS模型"),
            "response_format": ConfigField(type=str, default="wav", description="音频格式"),
            "speed": ConfigField(type=float, default=1.0, description="语音速度")
        },
        # Backend: locally hosted GPT-SoVITS service.
        "gpt_sovits": {
            "server": ConfigField(
                type=str, default="http://127.0.0.1:9880",
                description="GPT-SoVITS服务地址"
            ),
            "styles": ConfigField(
                type=list,
                default=[
                    {
                        "name": "default",
                        "refer_wav": "",
                        "prompt_text": "",
                        "prompt_language": "zh",
                        "gpt_weights": "",
                        "sovits_weights": ""
                    }
                ],
                description="语音风格配置",
                item_type="object",
                item_fields={
                    "name": {"type": "string", "label": "风格名称", "required": True},
                    "refer_wav": {"type": "string", "label": "参考音频路径", "required": True},
                    "prompt_text": {"type": "string", "label": "参考文本", "required": True},
                    "prompt_language": {"type": "string", "label": "参考语言", "default": "zh"},
                    "gpt_weights": {"type": "string", "label": "GPT模型权重路径(可选)", "required": False},
                    "sovits_weights": {"type": "string", "label": "SoVITS模型权重路径(可选)", "required": False}
                }
            )
        },
        # Backend: ByteDance Doubao (Volcano Engine) TTS.
        "doubao": {
            "api_url": ConfigField(
                type=str,
                default="https://openspeech.bytedance.com/api/v3/tts/unidirectional",
                description="豆包语音API地址"
            ),
            "app_id": ConfigField(type=str, default="", description="豆包APP ID"),
            "access_key": ConfigField(type=str, default="", description="豆包Access Key"),
            "resource_id": ConfigField(type=str, default="seed-tts-2.0", description="豆包Resource ID"),
            "default_voice": ConfigField(
                type=str, default="zh_female_vv_uranus_bigtts",
                description="默认音色"
            ),
            "timeout": ConfigField(type=int, default=60, description="API请求超时(秒)"),
            "audio_format": ConfigField(type=str, default="wav", description="音频格式"),
            "sample_rate": ConfigField(type=int, default=24000, description="采样率"),
            "bitrate": ConfigField(type=int, default=128000, description="比特率"),
            "speed": ConfigField(type=float, default=None, description="语音速度(可选)"),
            "volume": ConfigField(type=float, default=None, description="音量(可选)"),
            "context_texts": ConfigField(
                type=list, default=None,
                description="上下文辅助文本(可选,仅豆包2.0模型)"
            )
        },
        # Backend: CosyVoice via a Gradio endpoint.
        "cosyvoice": {
            "gradio_url": ConfigField(
                type=str,
                default="https://funaudiollm-fun-cosyvoice3-0-5b.ms.show/",
                description="Gradio API地址"
            ),
            "default_mode": ConfigField(
                type=str,
                default="3s极速复刻",
                description="推理模式(3s极速复刻/自然语言控制)"
            ),
            "default_instruct": ConfigField(
                type=str,
                default="You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
                description="默认指令(用于自然语言控制模式)"
            ),
            "reference_audio": ConfigField(
                type=str,
                default="",
                description="参考音频路径(用于3s极速复刻模式)"
            ),
            "prompt_text": ConfigField(
                type=str,
                default="",
                description="提示文本(用于3s极速复刻模式)"
            ),
            "timeout": ConfigField(type=int, default=300, description="API请求超时(秒)"),
            "audio_format": ConfigField(type=str, default="wav", description="音频格式")
        },
        # Backend: local ComfyUI workflow API (VoiceClone / CustomVoice).
        "comfyui": {
            "server": ConfigField(
                type=str,
                default="http://127.0.0.1:8188",
                description="ComfyUI 服务地址(示例: http://127.0.0.1:8188)",
            ),
            "input_dir": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/ComfyUI/ComfyUI/input",
                description="ComfyUI input 目录(用于放参考音频,LoadAudio 会从这里读)",
            ),
            "timeout": ConfigField(type=int, default=120, description="ComfyUI 请求超时(秒)"),
            "audio_quality": ConfigField(
                type=str,
                default="128k",
                description="输出 MP3 质量(SaveAudioMP3 quality: V0/128k/320k)",
            ),
            "mlx_python": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/.venv/bin/python",
                description="MLX Qwen3-TTS venv python 路径(用于 ComfyUI-MLX 节点子进程)",
            ),
            "mlx_cli": ConfigField(
                type=str,
                default="/Users/xenon/Downloads/seiun_tts/qwen3-tts-apple-silicon/mlx_voice_clone_cli.py",
                description="mlx_voice_clone_cli.py 路径",
            ),
            "default_style": ConfigField(type=str, default="default", description="默认风格名称"),
            "voiceclone_default_style": ConfigField(
                type=str,
                default="",
                description="VoiceClone 专用默认风格名称(用于 comfyui_voiceclone 后端;留空则回退到 default_style)",
            ),
            "customvoice_default_style": ConfigField(
                type=str,
                default="",
                description="CustomVoice 专用默认风格名称(用于 comfyui_customvoice 后端;留空则回退到 default_style)",
            ),
            "auto_instruct_enabled": ConfigField(
                type=bool,
                default=False,
                description="是否启用 CustomVoice instruct 自动推断(使用 MaiBot 的 LLM 接口)",
            ),
            "auto_instruct_max_chars": ConfigField(
                type=int,
                default=120,
                description="自动推断 instruct 的最大长度(字符)。建议 80-160,太短会导致情绪/表演提示被截断。",
            ),
            "auto_instruct_prompt": ConfigField(
                type=str,
                default="",
                description="自定义 instruct 推断 prompt(留空使用内置模板)",
            ),
            "auto_instruct_base_tone": ConfigField(
                type=str,
                default="",
                description="自动推断 instruct 时固定附加的基调描述(会作为 `基调=...;` 前缀插入;会自动清洗为单行,且不会包含 `;`/`=`)",
            ),
            "pause_linebreak": ConfigField(type=float, default=0.0, description="换行停顿(秒)"),
            "period_pause": ConfigField(type=float, default=0.0, description="句号停顿(秒)"),
            "comma_pause": ConfigField(type=float, default=0.0, description="逗号停顿(秒)"),
            "question_pause": ConfigField(type=float, default=0.0, description="问号停顿(秒)"),
            "hyphen_pause": ConfigField(type=float, default=0.0, description="连字符停顿(秒)"),
            "styles": ConfigField(
                type=list,
                default=[
                    {
                        "name": "default",
                        "refer_wav": "",
                        "prompt_text": "",
                        "language": "",
                        "model_choice": "1.7B",
                        "precision": "bf16",
                        "seed": 0,
                        "max_new_tokens": 2048,
                        "top_p": 0.8,
                        "top_k": 20,
                        "temperature": 1.0,
                        "repetition_penalty": 1.05,
                    }
                ],
                description="ComfyUI VoiceClone 风格配置(参考音频+逐字稿)",
                item_type="object",
                item_fields={
                    "name": {"type": "string", "label": "风格名称", "required": True},
                    "mode": {"type": "string", "label": "模式(voice_clone/custom_voice)", "required": False},
                    "refer_wav": {"type": "string", "label": "参考音频路径", "required": True},
                    "prompt_text": {"type": "string", "label": "参考文本(逐字稿)", "required": True},
                    "language": {"type": "string", "label": "语言(可选: Auto/Chinese/English/...) ", "required": False},
                    "model_choice": {"type": "string", "label": "模型(0.6B/1.7B)", "required": False},
                    "precision": {"type": "string", "label": "精度(bf16/fp32)", "required": False},
                    "model_path": {"type": "string", "label": "CustomVoice模型路径", "required": False},
                    "speaker": {"type": "string", "label": "CustomVoice说话人", "required": False},
                    "instruct": {"type": "string", "label": "CustomVoice指令(或__AUTO__)", "required": False},
                    "auto_instruct": {"type": "boolean", "label": "按style启用auto_instruct", "required": False},
                    "speed": {"type": "number", "label": "speed", "required": False},
                    "seed": {"type": "number", "label": "seed", "required": False},
                    "max_new_tokens": {"type": "number", "label": "max_new_tokens", "required": False},
                    "top_p": {"type": "number", "label": "top_p", "required": False},
                    "top_k": {"type": "number", "label": "top_k", "required": False},
                    "temperature": {"type": "number", "label": "temperature", "required": False},
                    "repetition_penalty": {"type": "number", "label": "repetition_penalty", "required": False},
                },
            ),
        }
    }
|
||||
|
||||
def get_plugin_components(self) -> List[Tuple[ComponentInfo, Type]]:
|
||||
"""返回插件组件列表"""
|
||||
components = []
|
||||
|
||||
try:
|
||||
action_enabled = self.get_config(ConfigKeys.COMPONENTS_ACTION_ENABLED, True)
|
||||
command_enabled = self.get_config(ConfigKeys.COMPONENTS_COMMAND_ENABLED, True)
|
||||
instruct_enabled = self.get_config(ConfigKeys.COMPONENTS_INSTRUCT_COMMAND_ENABLED, True)
|
||||
except AttributeError:
|
||||
action_enabled = True
|
||||
command_enabled = True
|
||||
instruct_enabled = True
|
||||
|
||||
if action_enabled:
|
||||
components.append((UnifiedTTSAction.get_action_info(), UnifiedTTSAction))
|
||||
|
||||
if command_enabled:
|
||||
components.append((UnifiedTTSCommand.get_command_info(), UnifiedTTSCommand))
|
||||
|
||||
if instruct_enabled:
|
||||
components.append((TTSInstructCommand.get_command_info(), TTSInstructCommand))
|
||||
|
||||
return components
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
"""
|
||||
TTS工具模块
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.dont_write_bytecode = True
|
||||
|
||||
from .text import TTSTextUtils
|
||||
from .session import TTSSessionManager
|
||||
from .file import TTSFileManager
|
||||
|
||||
__all__ = ["TTSTextUtils", "TTSSessionManager", "TTSFileManager"]
|
||||
|
|
@ -0,0 +1,280 @@
|
|||
"""
|
||||
文件操作工具类
|
||||
提供异步文件操作、临时文件管理等功能
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import tempfile
|
||||
import asyncio
|
||||
import base64
|
||||
from typing import Optional
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("tts_file_manager")
|
||||
|
||||
# 音频数据最小有效大小(字节)
|
||||
MIN_AUDIO_SIZE = 100
|
||||
|
||||
|
||||
class TTSFileManager:
|
||||
"""
|
||||
TTS文件管理器
|
||||
|
||||
提供:
|
||||
- 临时文件创建(避免并发冲突)
|
||||
- 异步文件写入
|
||||
- 自动清理
|
||||
- 相对路径和绝对路径支持
|
||||
"""
|
||||
|
||||
# 临时文件目录(兼容旧代码)
|
||||
_temp_dir: Optional[str] = None
|
||||
|
||||
# 项目根目录(用于解析相对路径)
|
||||
_project_root: Optional[str] = None
|
||||
|
||||
@classmethod
|
||||
def set_project_root(cls, root_path: str):
|
||||
"""设置项目根目录"""
|
||||
if os.path.isdir(root_path):
|
||||
cls._project_root = root_path
|
||||
logger.debug(f"设置项目根目录: {root_path}")
|
||||
else:
|
||||
logger.warning(f"项目根目录不存在: {root_path}")
|
||||
|
||||
@classmethod
|
||||
def get_project_root(cls) -> str:
|
||||
"""获取项目根目录"""
|
||||
if cls._project_root is None:
|
||||
# 尝试从当前文件位置推断项目根目录
|
||||
current_file = os.path.abspath(__file__)
|
||||
# 假设结构是: project_root/plugins/tts_voice_plugin/utils/file.py
|
||||
cls._project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(current_file))))
|
||||
logger.debug(f"自动推断项目根目录: {cls._project_root}")
|
||||
return cls._project_root
|
||||
|
||||
@classmethod
|
||||
def resolve_path(cls, path: str) -> str:
|
||||
"""
|
||||
解析路径(支持相对路径和绝对路径)
|
||||
|
||||
Args:
|
||||
path: 路径字符串
|
||||
|
||||
Returns:
|
||||
解析后的绝对路径
|
||||
"""
|
||||
if os.path.isabs(path):
|
||||
# 已经是绝对路径
|
||||
return path
|
||||
else:
|
||||
# 相对路径,相对于项目根目录
|
||||
return os.path.join(cls.get_project_root(), path)
|
||||
|
||||
@classmethod
|
||||
def ensure_dir(cls, dir_path: str) -> bool:
|
||||
"""
|
||||
确保目录存在,不存在则创建
|
||||
|
||||
Args:
|
||||
dir_path: 目录路径
|
||||
|
||||
Returns:
|
||||
是否成功
|
||||
"""
|
||||
try:
|
||||
os.makedirs(dir_path, exist_ok=True)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"创建目录失败: {dir_path}, 错误: {e}")
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def get_temp_dir(cls) -> str:
|
||||
"""
|
||||
获取临时文件目录(已废弃,保留兼容性)
|
||||
|
||||
Returns:
|
||||
临时目录路径
|
||||
"""
|
||||
if cls._temp_dir is None:
|
||||
cls._temp_dir = tempfile.gettempdir()
|
||||
return cls._temp_dir
|
||||
|
||||
@classmethod
def set_temp_dir(cls, path: str):
    """Set the temporary-file directory (deprecated, kept for compatibility).

    Args:
        path: Directory path.

    Raises:
        ValueError: If *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        raise ValueError(f"目录不存在: {path}")
    cls._temp_dir = path
|
||||
|
||||
@classmethod
def generate_temp_path(cls, prefix: str = "tts", suffix: str = ".mp3", output_dir: str = "") -> str:
    """Build a unique output path for a generated audio file.

    Args:
        prefix: File-name prefix.
        suffix: File extension (including the dot).
        output_dir: Target directory; relative paths are resolved against
            the project root, and an empty value means the root itself.

    Returns:
        Absolute path of the (not yet created) output file.
    """
    # Pick the target directory: project root by default, otherwise the
    # user-configured directory resolved to an absolute path.
    target = cls.get_project_root() if not output_dir else cls.resolve_path(output_dir)
    if output_dir and not cls.ensure_dir(target):
        # Creation failed — degrade to the project root instead of erroring.
        logger.warning(f"无法创建输出目录 {target},使用项目根目录")
        target = cls.get_project_root()

    # A 12-hex-digit UUID fragment keeps concurrent generations from colliding.
    return os.path.join(target, f"{prefix}_{uuid.uuid4().hex[:12]}{suffix}")
|
||||
|
||||
@classmethod
async def write_audio_async(cls, path: str, data: bytes) -> bool:
    """Asynchronously write audio bytes to *path*.

    The blocking file write runs in the default thread-pool executor so
    the event loop is never stalled.

    Args:
        path: Destination file path.
        data: Raw audio bytes.

    Returns:
        True on success, False on any failure (errors are logged).
    """
    try:
        # get_running_loop() is the supported API inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, cls._write_file_sync, path, data)
        logger.debug(f"音频文件写入成功: {path} ({len(data)} bytes)")
        return True
    except IOError as e:
        logger.error(f"写入音频文件失败: {path}, 错误: {e}")
        return False
    except Exception as e:
        logger.error(f"写入音频文件时发生未知错误: {path}, 错误: {e}")
        return False
|
||||
|
||||
@staticmethod
def _write_file_sync(path: str, data: bytes):
    """Blocking helper: write *data* to *path* in binary mode."""
    with open(path, "wb") as fh:
        fh.write(data)
|
||||
|
||||
@classmethod
def write_audio_sync(cls, path: str, data: bytes) -> bool:
    """Synchronously write audio bytes to *path*.

    Args:
        path: Destination file path.
        data: Raw audio bytes.

    Returns:
        True on success, False on any failure (errors are logged).
    """
    try:
        cls._write_file_sync(path, data)
        logger.debug(f"音频文件写入成功: {path} ({len(data)} bytes)")
    except IOError as e:
        logger.error(f"写入音频文件失败: {path}, 错误: {e}")
        return False
    except Exception as e:
        logger.error(f"写入音频文件时发生未知错误: {path}, 错误: {e}")
        return False
    return True
|
||||
|
||||
@classmethod
def cleanup_file(cls, path: str, silent: bool = True) -> bool:
    """Delete a temporary file if it exists.

    Args:
        path: File path; an empty/None value is treated as nothing to do.
        silent: When True, deletion errors are swallowed without logging.

    Returns:
        True if a file was actually removed, False otherwise.
    """
    try:
        if not path or not os.path.exists(path):
            return False
        os.remove(path)
        logger.debug(f"临时文件已清理: {path}")
        return True
    except Exception as e:
        if not silent:
            logger.warning(f"清理临时文件失败: {path}, 错误: {e}")
        return False
|
||||
|
||||
@classmethod
async def cleanup_file_async(cls, path: str, delay: float = 0) -> bool:
    """Asynchronously delete a temporary file, optionally after a delay.

    Args:
        path: File path to remove.
        delay: Seconds to wait before deleting (0 = immediately).

    Returns:
        True if a file was removed, False otherwise.
    """
    if delay > 0:
        await asyncio.sleep(delay)
    # get_running_loop() is the supported API inside a coroutine;
    # get_event_loop() is deprecated in this context since Python 3.10.
    loop = asyncio.get_running_loop()
    # Run the blocking removal in the default executor; silent=True keeps
    # cleanup best-effort.
    return await loop.run_in_executor(None, cls.cleanup_file, path, True)
|
||||
|
||||
@classmethod
def validate_audio_data(cls, data: bytes, min_size: Optional[int] = None) -> tuple:
    """Validate that *data* looks like usable audio.

    Args:
        data: Raw audio bytes (may be None).
        min_size: Minimum acceptable size in bytes; defaults to
            MIN_AUDIO_SIZE when None.

    Returns:
        (is_valid, error_message) — error_message is "" when valid.
    """
    if data is None:
        return False, "音频数据为空"

    # The previous `min_size or MIN_AUDIO_SIZE` silently replaced an
    # explicit min_size=0; compare against None so every caller-supplied
    # value is honoured.
    if min_size is None:
        min_size = MIN_AUDIO_SIZE

    if len(data) < min_size:
        return False, f"音频数据过小({len(data)}字节 < {min_size}字节)"

    return True, ""
|
||||
|
||||
@classmethod
def audio_to_base64(cls, data: bytes) -> str:
    """Encode audio bytes as a base64 string.

    Args:
        data: Raw audio bytes.

    Returns:
        The base64-encoded text, or "" if encoding fails.
    """
    try:
        encoded = base64.b64encode(data)
    except Exception as e:
        logger.error(f"音频数据转base64失败: {e}")
        return ""
    # b64encode output is pure ASCII, so this decode cannot fail.
    return encoded.decode('utf-8')
|
||||
|
|
@ -0,0 +1,186 @@
|
|||
"""
|
||||
HTTP Session 管理器
|
||||
提供连接池复用,避免每次请求创建新连接
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from typing import Optional, Dict, Any
|
||||
from contextlib import asynccontextmanager
|
||||
from src.common.logger import get_logger
|
||||
|
||||
logger = get_logger("tts_session_manager")
|
||||
|
||||
|
||||
class TTSSessionManager:
    """
    TTS HTTP session manager.

    Provides:
    - connection-pool reuse (one aiohttp session per backend)
    - automatic timeout management
    - graceful shutdown of all open sessions
    """

    # Process-wide singleton and the lock guarding its creation.
    # NOTE(review): instantiating asyncio.Lock() at import time is only
    # unproblematic on Python 3.10+, where locks no longer bind to an
    # event loop at construction — confirm the minimum supported version.
    _instance: Optional["TTSSessionManager"] = None
    _lock = asyncio.Lock()

    def __init__(self):
        # One ClientSession per backend name, created lazily by get_session().
        self._sessions: Dict[str, aiohttp.ClientSession] = {}
        # Default total request timeout in seconds.
        self._default_timeout = 60

    @classmethod
    async def get_instance(cls) -> "TTSSessionManager":
        """Return the singleton instance (double-checked under the async lock)."""
        if cls._instance is None:
            async with cls._lock:
                # Re-check after acquiring the lock: another task may have
                # created the instance while we were waiting.
                if cls._instance is None:
                    cls._instance = cls()
        return cls._instance

    async def get_session(
        self,
        backend_name: str = "default",
        timeout: Optional[int] = None
    ) -> aiohttp.ClientSession:
        """
        Get or lazily create the HTTP session for a backend.

        Args:
            backend_name: Backend name used to key separate sessions.
            timeout: Total request timeout in seconds; falls back to the
                manager default (60s) when None.

        Returns:
            An open aiohttp.ClientSession instance.
        """
        # (Re)create the session if it was never made or has been closed.
        if backend_name not in self._sessions or self._sessions[backend_name].closed:
            timeout_val = timeout or self._default_timeout
            connector = aiohttp.TCPConnector(
                limit=10,  # total simultaneous connections (per-host cap below)
                limit_per_host=5,
                ttl_dns_cache=300,  # cache DNS lookups for 5 minutes
                force_close=True,  # disable keep-alive; works around compatibility issues with APIs such as GSV2P
            )
            self._sessions[backend_name] = aiohttp.ClientSession(
                connector=connector,
                timeout=aiohttp.ClientTimeout(total=timeout_val)
            )
            logger.debug(f"创建新的HTTP Session: {backend_name}")

        return self._sessions[backend_name]

    async def close_session(self, backend_name: Optional[str] = None):
        """
        Close one session or all of them.

        Args:
            backend_name: Backend whose session to close; when None, every
                tracked session is closed and the registry is cleared.
        """
        if backend_name:
            if backend_name in self._sessions:
                await self._sessions[backend_name].close()
                del self._sessions[backend_name]
                logger.debug(f"关闭HTTP Session: {backend_name}")
        else:
            for name, session in self._sessions.items():
                if not session.closed:
                    await session.close()
                    logger.debug(f"关闭HTTP Session: {name}")
            self._sessions.clear()

    @asynccontextmanager
    async def post(
        self,
        url: str,
        json: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
        data: Any = None,
        backend_name: str = "default",
        timeout: Optional[int] = None
    ):
        """
        Send a POST request (async context manager).

        Args:
            url: Request URL.
            json: JSON request body.
            headers: Request headers.
            data: Form/body data.
            backend_name: Backend name selecting the pooled session.
            timeout: Per-request timeout override in seconds.

        Yields:
            aiohttp.ClientResponse

        Usage:
            async with session_manager.post(url, json=data) as response:
                ...
        """
        session = await self.get_session(backend_name, timeout)

        # A per-request timeout overrides the session-level one.
        req_timeout = None
        if timeout:
            req_timeout = aiohttp.ClientTimeout(total=timeout)

        response = await session.post(
            url,
            json=json,
            headers=headers,
            data=data,
            timeout=req_timeout
        )
        try:
            yield response
        finally:
            # Return the connection to the pool even if the caller raised.
            response.release()

    @asynccontextmanager
    async def get(
        self,
        url: str,
        headers: Optional[Dict[str, str]] = None,
        params: Optional[Dict[str, Any]] = None,
        backend_name: str = "default",
        timeout: Optional[int] = None
    ):
        """
        Send a GET request (async context manager).

        Args:
            url: Request URL.
            headers: Request headers.
            params: URL query parameters.
            backend_name: Backend name selecting the pooled session.
            timeout: Per-request timeout override in seconds.

        Yields:
            aiohttp.ClientResponse

        Usage:
            async with session_manager.get(url) as response:
                ...
        """
        session = await self.get_session(backend_name, timeout)

        # A per-request timeout overrides the session-level one.
        req_timeout = None
        if timeout:
            req_timeout = aiohttp.ClientTimeout(total=timeout)

        response = await session.get(
            url,
            headers=headers,
            params=params,
            timeout=req_timeout
        )
        try:
            yield response
        finally:
            # Return the connection to the pool even if the caller raised.
            response.release()

    async def __aenter__(self):
        # Allow `async with TTSSessionManager() as mgr:` usage.
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Close every pooled session when leaving the context.
        await self.close_session()
|
||||
|
|
@ -0,0 +1,181 @@
|
|||
"""
|
||||
文本处理工具类
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
class TTSTextUtils:
    """Text-processing helpers for the TTS pipeline."""

    # Replacement table for common internet slang. Currently unused:
    # clean_text() keeps the original wording verbatim.
    NETWORK_SLANG_MAP = {
        'www': '哈哈哈',
        'hhh': '哈哈',
        '233': '哈哈',
        '666': '厉害',
        '88': '拜拜',
        '...': '。',
        '……': '。'
    }

    # Characters outside this whitelist would be stripped by the
    # (currently disabled) cleaning pass.
    SPECIAL_CHAR_PATTERN = re.compile(
        r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ffa-zA-Z0-9\s,。!?、;:()【】"\'.,!?;:()\[\]`-]'
    )

    # Per-script character classes used by detect_language().
    CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]')
    ENGLISH_PATTERN = re.compile(r'[a-zA-Z]')
    JAPANESE_PATTERN = re.compile(r'[\u3040-\u309f\u30a0-\u30ff]')

    @classmethod
    def clean_text(cls, text: str, max_length: int = 500) -> str:
        """Normalise text before synthesis.

        Special-character stripping and slang replacement are deliberately
        disabled so the original wording is preserved; the text is only
        stripped of surrounding whitespace, never hard-truncated.

        Args:
            text: Raw input text.
            max_length: Advisory limit only — kept for interface
                compatibility, no truncation is performed.

        Returns:
            The stripped text, or "" for empty input.
        """
        return text.strip() if text else ""

    @classmethod
    def detect_language(cls, text: str) -> str:
        """Guess the dominant language of *text*.

        Chinese wins at a >30% han-character ratio, then Japanese kana at
        >30%, then English at >80%; anything ambiguous (or empty) defaults
        to Chinese.

        Args:
            text: Text to inspect.

        Returns:
            Language code: "zh", "ja" or "en".
        """
        if not text:
            return "zh"

        zh_count = len(cls.CHINESE_PATTERN.findall(text))
        en_count = len(cls.ENGLISH_PATTERN.findall(text))
        ja_count = len(cls.JAPANESE_PATTERN.findall(text))
        total = zh_count + en_count + ja_count

        if total == 0:
            return "zh"

        if zh_count / total > 0.3:
            return "zh"
        if ja_count / total > 0.3:
            return "ja"
        if en_count / total > 0.8:
            return "en"
        return "zh"

    @classmethod
    def resolve_voice_alias(
        cls,
        voice: Optional[str],
        alias_map: dict,
        default: str,
        prefix: str = ""
    ) -> str:
        """Map a user-facing voice alias to an internal voice ID.

        Args:
            voice: Requested voice (alias or internal ID); may be None/empty.
            alias_map: Alias -> internal-ID mapping.
            default: Fallback voice when nothing matches.
            prefix: Internal-ID prefix (e.g. "lucy-voice-"); values that
                already carry it are passed through unchanged.

        Returns:
            The resolved internal voice ID.
        """
        candidate = voice or default

        # Already an internal ID: nothing to translate.
        if prefix and candidate.startswith(prefix):
            return candidate

        if candidate in alias_map:
            return alias_map[candidate]

        # Unknown alias — fall back to the default's alias, or the
        # default itself when it has no mapping either.
        return alias_map.get(default, default)

    @classmethod
    def split_sentences(cls, text: str, min_length: int = 2) -> List[str]:
        """Split *text* into sentences on CJK/ASCII terminal punctuation.

        Delimiters stay attached to the sentence they terminate, and
        sentences shorter than *min_length* are appended to the preceding
        sentence.

        Args:
            text: Text to split.
            min_length: Minimum sentence length; 0 disables merging.

        Returns:
            List of sentences (empty list for empty input).
        """
        if not text:
            return []

        # Split on sentence-final punctuation, keeping the delimiters as
        # their own pieces via the capture group.
        delimiter = r'([。!?!?;;])'
        pieces = re.split(delimiter, text)

        sentences: List[str] = []
        buffer = ""
        for piece in pieces:
            if not piece:
                continue
            if re.match(delimiter, piece):
                # Terminal punctuation: keep it with the sentence it ends.
                buffer += piece
            else:
                if buffer.strip():
                    sentences.append(buffer.strip())
                buffer = piece

        if buffer.strip():
            sentences.append(buffer.strip())

        # Fold too-short sentences into their predecessor.
        if min_length > 0 and len(sentences) > 1:
            folded: List[str] = []
            for sentence in sentences:
                if folded and len(sentence) < min_length:
                    folded[-1] += sentence
                else:
                    folded.append(sentence)
            sentences = folded

        return sentences
|
||||
Loading…
Reference in New Issue